aboutsummaryrefslogtreecommitdiff
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/benchmarks/CMakeLists.txt1
-rw-r--r--llvm/benchmarks/SpecialCaseListBM.cpp207
-rw-r--r--llvm/docs/LangRef.rst18
-rw-r--r--llvm/docs/ReleaseNotes.md8
-rw-r--r--llvm/include/llvm-c/Core.h2
-rw-r--r--llvm/include/llvm/ADT/Bitfields.h88
-rw-r--r--llvm/include/llvm/ADT/StringExtras.h6
-rw-r--r--llvm/include/llvm/ADT/StringSwitch.h52
-rw-r--r--llvm/include/llvm/Analysis/ScalarEvolution.h4
-rw-r--r--llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h80
-rw-r--r--llvm/include/llvm/Analysis/StaticDataProfileInfo.h18
-rw-r--r--llvm/include/llvm/CodeGen/ISDOpcodes.h6
-rw-r--r--llvm/include/llvm/CodeGen/SelectionDAG.h6
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h12
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMP.td1
-rw-r--r--llvm/include/llvm/Frontend/OpenMP/OMPKinds.def2
-rw-r--r--llvm/include/llvm/IR/ConstantFPRange.h13
-rw-r--r--llvm/include/llvm/IR/IRBuilder.h5
-rw-r--r--llvm/include/llvm/Support/DebugCounter.h3
-rw-r--r--llvm/include/llvm/Support/Format.h15
-rw-r--r--llvm/include/llvm/Support/SpecialCaseList.h16
-rw-r--r--llvm/include/llvm/TableGen/CodeGenHelpers.h29
-rw-r--r--llvm/include/llvm/Target/TargetSelectionDAG.td1
-rw-r--r--llvm/include/llvm/TargetParser/RISCVTargetParser.h2
-rw-r--r--llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h8
-rw-r--r--llvm/include/llvm/Transforms/Coroutines/SpillUtils.h9
-rw-r--r--llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h5
-rw-r--r--llvm/include/llvm/XRay/BlockIndexer.h6
-rw-r--r--llvm/include/llvm/XRay/BlockPrinter.h6
-rw-r--r--llvm/include/llvm/XRay/BlockVerifier.h6
-rw-r--r--llvm/include/llvm/XRay/FDRLogBuilder.h6
-rw-r--r--llvm/include/llvm/XRay/FDRRecordConsumer.h6
-rw-r--r--llvm/include/llvm/XRay/FDRRecordProducer.h6
-rw-r--r--llvm/include/llvm/XRay/FDRRecords.h6
-rw-r--r--llvm/include/llvm/XRay/FDRTraceExpander.h6
-rw-r--r--llvm/include/llvm/XRay/FDRTraceWriter.h6
-rw-r--r--llvm/include/llvm/XRay/FileHeaderReader.h6
-rw-r--r--llvm/include/llvm/XRay/Graph.h7
-rw-r--r--llvm/include/llvm/XRay/InstrumentationMap.h19
-rw-r--r--llvm/include/llvm/XRay/Profile.h6
-rw-r--r--llvm/include/llvm/XRay/RecordPrinter.h6
-rw-r--r--llvm/include/llvm/XRay/Trace.h6
-rw-r--r--llvm/include/llvm/XRay/XRayRecord.h6
-rw-r--r--llvm/include/llvm/XRay/YAMLXRayRecord.h18
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp127
-rw-r--r--llvm/lib/Analysis/StaticDataProfileInfo.cpp40
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AIXException.cpp4
-rw-r--r--llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp4
-rw-r--r--llvm/lib/CodeGen/BasicBlockPathCloning.cpp4
-rw-r--r--llvm/lib/CodeGen/BreakFalseDeps.cpp4
-rw-r--r--llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp4
-rw-r--r--llvm/lib/CodeGen/EdgeBundles.cpp11
-rw-r--r--llvm/lib/CodeGen/ExpandFp.cpp134
-rw-r--r--llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp38
-rw-r--r--llvm/lib/CodeGen/GlobalMergeFunctions.cpp10
-rw-r--r--llvm/lib/CodeGen/LiveIntervals.cpp6
-rw-r--r--llvm/lib/CodeGen/MIR2Vec.cpp13
-rw-r--r--llvm/lib/CodeGen/MIRFSDiscriminator.cpp2
-rw-r--r--llvm/lib/CodeGen/MIRNamerPass.cpp17
-rw-r--r--llvm/lib/CodeGen/MIRPrinter.cpp36
-rw-r--r--llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp9
-rw-r--r--llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp18
-rw-r--r--llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp9
-rw-r--r--llvm/lib/CodeGen/MachineFunction.cpp62
-rw-r--r--llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp8
-rw-r--r--llvm/lib/CodeGen/MachineOutliner.cpp5
-rw-r--r--llvm/lib/CodeGen/MachinePipeliner.cpp13
-rw-r--r--llvm/lib/CodeGen/MachineScheduler.cpp104
-rw-r--r--llvm/lib/CodeGen/MachineTraceMetrics.cpp7
-rw-r--r--llvm/lib/CodeGen/NonRelocatableStringpool.cpp4
-rw-r--r--llvm/lib/CodeGen/SafeStack.cpp4
-rw-r--r--llvm/lib/CodeGen/ScheduleDAGInstrs.cpp6
-rw-r--r--llvm/lib/CodeGen/ScheduleDAGPrinter.cpp80
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp17
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp49
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp7
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp1
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp1
-rw-r--r--llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp5
-rw-r--r--llvm/lib/CodeGen/StaticDataAnnotator.cpp15
-rw-r--r--llvm/lib/CodeGen/StaticDataSplitter.cpp6
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp3
-rw-r--r--llvm/lib/CodeGen/TargetRegisterInfo.cpp17
-rw-r--r--llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp6
-rw-r--r--llvm/lib/IR/ConstantFPRange.cpp166
-rw-r--r--llvm/lib/IR/Constants.cpp7
-rw-r--r--llvm/lib/IR/Core.cpp11
-rw-r--r--llvm/lib/IR/IRBuilder.cpp13
-rw-r--r--llvm/lib/IR/Instructions.cpp16
-rw-r--r--llvm/lib/IR/Verifier.cpp2
-rw-r--r--llvm/lib/Support/DebugCounter.cpp56
-rw-r--r--llvm/lib/Support/SpecialCaseList.cpp53
-rw-r--r--llvm/lib/TableGen/Record.cpp9
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp5
-rw-r--r--llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp70
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.cpp132
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.h31
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp31
-rw-r--r--llvm/lib/Target/AMDGPU/MIMGInstructions.td28
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td17
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp8
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td16
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td34
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp34
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXSubtarget.h34
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp4
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h66
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp20
-rw-r--r--llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp205
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrFormats.td16
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp8
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td22
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td9
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td143
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrPredicates.td36
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp21
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td68
-rw-r--r--llvm/lib/TargetParser/RISCVTargetParser.cpp4
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroCloner.h9
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroEarly.cpp2
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroInternal.h9
-rw-r--r--llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp5
-rw-r--r--llvm/lib/Transforms/Coroutines/SpillUtils.cpp37
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h2
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp4
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp5
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfUse.cpp55
-rw-r--r--llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp43
-rw-r--r--llvm/lib/Transforms/Utils/CloneFunction.cpp67
-rw-r--r--llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp105
-rw-r--r--llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp21
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp6
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h1
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp12
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h1
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h8
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp35
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp3
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp8
-rw-r--r--llvm/lib/XRay/BlockIndexer.cpp7
-rw-r--r--llvm/lib/XRay/BlockPrinter.cpp7
-rw-r--r--llvm/lib/XRay/BlockVerifier.cpp18
-rw-r--r--llvm/lib/XRay/FDRRecordProducer.cpp14
-rw-r--r--llvm/lib/XRay/FDRRecords.cpp7
-rw-r--r--llvm/lib/XRay/FDRTraceExpander.cpp7
-rw-r--r--llvm/lib/XRay/FDRTraceWriter.cpp12
-rw-r--r--llvm/lib/XRay/FileHeaderReader.cpp12
-rw-r--r--llvm/lib/XRay/LogBuilderConsumer.cpp7
-rw-r--r--llvm/lib/XRay/Profile.cpp18
-rw-r--r--llvm/lib/XRay/RecordInitializer.cpp7
-rw-r--r--llvm/lib/XRay/RecordPrinter.cpp7
-rw-r--r--llvm/lib/XRay/Trace.cpp18
-rw-r--r--llvm/test/Analysis/ScalarEvolution/ptrtoint.ll78
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir276
-rw-r--r--llvm/test/CodeGen/AArch64/adds_cmn.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/framelayout-sve-win.mir30
-rw-r--r--llvm/test/CodeGen/AArch64/sat-add.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/win-sve.ll148
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll280
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll448
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll9323
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll875
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll988
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll1934
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll120
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll3482
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll455
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll395
-rw-r--r--llvm/test/CodeGen/AMDGPU/calling-conventions.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll34
-rw-r--r--llvm/test/CodeGen/AMDGPU/frem.ll1300
-rw-r--r--llvm/test/CodeGen/AMDGPU/function-args.ll190
-rw-r--r--llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll152
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot4u.ll13
-rw-r--r--llvm/test/CodeGen/AMDGPU/sched.group.classification.mir59
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir9
-rw-r--r--llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir7
-rw-r--r--llvm/test/CodeGen/ARM/carry.ll87
-rw-r--r--llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir2
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-commit.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-cp.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-fence.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-ld.ll4
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-shift.ll2
-rw-r--r--llvm/test/CodeGen/NVPTX/tcgen05-st.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll4
-rw-r--r--llvm/test/CodeGen/RISCV/i64-icmp.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir523
-rw-r--r--llvm/test/CodeGen/RISCV/select-to-and-zext.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/setcc-logic.ll5
-rw-r--r--llvm/test/CodeGen/RISCV/sext-zext-trunc.ll6
-rw-r--r--llvm/test/CodeGen/RISCV/xaluo.ll12
-rw-r--r--llvm/test/CodeGen/Thumb2/carry.ll59
-rw-r--r--llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll48
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll106
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll1309
-rw-r--r--llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll67
-rw-r--r--llvm/test/CodeGen/X86/avg.ll74
-rw-r--r--llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll13
-rw-r--r--llvm/test/CodeGen/X86/global-variable-partition.ll18
-rw-r--r--llvm/test/CodeGen/X86/relptr-rodata.ll15
-rw-r--r--llvm/test/CodeGen/X86/setcc-wide-types.ll155
-rw-r--r--llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll2
-rw-r--r--llvm/test/MC/WebAssembly/simd-encodings.s8
-rw-r--r--llvm/test/Other/debugcounter-dce.ll10
-rw-r--r--llvm/test/TableGen/listsplat.td6
-rw-r--r--llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll280
-rw-r--r--llvm/test/Transforms/InstCombine/add-sitofp.ll7
-rw-r--r--llvm/test/Transforms/InstCombine/binop-itofp.ll22
-rw-r--r--llvm/test/Transforms/InstCombine/ptrtoaddr.ll5
-rw-r--r--llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll103
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll539
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll312
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll9
-rw-r--r--llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll9
-rw-r--r--llvm/test/Transforms/PGOProfile/data-access-profile.ll83
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll16
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll6
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll27
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll8
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll6
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll9
-rw-r--r--llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll47
-rw-r--r--llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll14
-rw-r--r--llvm/test/Verifier/llvm.used-invalid-init.ll2
-rw-r--r--llvm/tools/bugpoint/BugDriver.cpp18
-rw-r--r--llvm/tools/bugpoint/BugDriver.h37
-rw-r--r--llvm/tools/bugpoint/CrashDebugger.cpp124
-rw-r--r--llvm/tools/bugpoint/ExecutionDriver.cpp57
-rw-r--r--llvm/tools/bugpoint/ExtractFunction.cpp64
-rw-r--r--llvm/tools/bugpoint/Miscompilation.cpp40
-rw-r--r--llvm/tools/bugpoint/OptimizerDriver.cpp2
-rw-r--r--llvm/tools/bugpoint/ToolRunner.cpp32
-rw-r--r--llvm/tools/bugpoint/bugpoint.cpp2
-rw-r--r--llvm/tools/llvm-jitlink/llvm-jitlink.cpp5
-rw-r--r--llvm/unittests/ADT/BitFieldsTest.cpp4
-rw-r--r--llvm/unittests/ADT/StringExtrasTest.cpp6
-rw-r--r--llvm/unittests/ADT/StringSwitchTest.cpp13
-rw-r--r--llvm/unittests/Analysis/ScalarEvolutionTest.cpp12
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp4
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp10
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp7
-rw-r--r--llvm/unittests/IR/ConstantFPRangeTest.cpp176
-rw-r--r--llvm/unittests/IR/InstructionsTest.cpp40
-rw-r--r--llvm/unittests/Support/CMakeLists.txt1
-rw-r--r--llvm/unittests/Support/Format.cpp56
-rw-r--r--llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp51
-rw-r--r--llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp220
-rw-r--r--llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp9
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn3
-rw-r--r--llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn20
-rw-r--r--llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn1
-rw-r--r--llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn1
-rw-r--r--llvm/utils/profcheck-xfail.txt77
267 files changed, 16380 insertions, 13088 deletions
diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt
index 3cbfb0d..e411ed4 100644
--- a/llvm/benchmarks/CMakeLists.txt
+++ b/llvm/benchmarks/CMakeLists.txt
@@ -11,6 +11,7 @@ add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED)
add_benchmark(GetIntrinsicInfoTableEntriesBM GetIntrinsicInfoTableEntriesBM.cpp PARTIAL_SOURCES_INTENDED)
add_benchmark(SandboxIRBench SandboxIRBench.cpp PARTIAL_SOURCES_INTENDED)
add_benchmark(MustacheBench Mustache.cpp PARTIAL_SOURCES_INTENDED)
+add_benchmark(SpecialCaseListBM SpecialCaseListBM.cpp PARTIAL_SOURCES_INTENDED)
add_benchmark(RuntimeLibcallsBench RuntimeLibcalls.cpp PARTIAL_SOURCES_INTENDED)
diff --git a/llvm/benchmarks/SpecialCaseListBM.cpp b/llvm/benchmarks/SpecialCaseListBM.cpp
new file mode 100644
index 0000000..00aa3cd
--- /dev/null
+++ b/llvm/benchmarks/SpecialCaseListBM.cpp
@@ -0,0 +1,207 @@
+#include "benchmark/benchmark.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SpecialCaseList.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <iterator>
+#include <random>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+constexpr int RNG_SEED = 123456;
+constexpr int MAX_LIST_MIN = 10;
+constexpr int MAX_LIST_MAX = 1000000;
+constexpr int MAX_LIST_MUL = 10;
+
+std::unique_ptr<SpecialCaseList> makeSpecialCaseList(StringRef List) {
+ std::string Error;
+ std::unique_ptr<MemoryBuffer> MB = MemoryBuffer::getMemBuffer(List);
+ auto SCL = SpecialCaseList::create(MB.get(), Error);
+ assert(SCL);
+ assert(Error == "");
+ return SCL;
+}
+
+static const std::string Dictionary[] = {
+ "orange", "tabby", "tortie", "tuxedo", "void",
+ "multiple", "spaces", "cute", "fluffy", "kittens",
+};
+
+std::vector<std::string> genFiles(size_t NumFiles) {
+ std::vector<std::string> R;
+ R.reserve(NumFiles);
+ std::minstd_rand Rng(RNG_SEED);
+ std::uniform_int_distribution<> DepthDistrib(8, 16);
+ std::uniform_int_distribution<> WordDistrib(0, std::size(Dictionary) - 1);
+
+ std::string S;
+ for (size_t I = 0; I < NumFiles; ++I) {
+ for (size_t D = DepthDistrib(Rng); D; --D) {
+ S += Dictionary[WordDistrib(Rng)];
+ if (D > 1)
+ S += "/";
+ }
+ R.push_back(std::move(S));
+ S.clear();
+ }
+ return R;
+}
+
+std::string genGlobNone(const std::vector<std::string> &Files) {
+ std::string S;
+ for (const auto &F : Files) {
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
+std::string genGlobInMid(const std::vector<std::string> &Files) {
+ std::string S;
+ std::minstd_rand Rng(RNG_SEED);
+ for (std::string F : Files) {
+ std::uniform_int_distribution<> PosDistrib(0, F.size() - 1);
+ F[PosDistrib(Rng)] = '*';
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
+std::string genGlobAtStart(const std::vector<std::string> &Files) {
+ std::string S;
+ for (std::string F : Files) {
+ F.front() = '*';
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
+std::string genGlobAtEnd(const std::vector<std::string> &Files) {
+ std::string S;
+ for (std::string F : Files) {
+ F.back() = '*';
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
+std::string genGlobAtBothSides(const std::vector<std::string> &Files) {
+ std::string S;
+ for (std::string F : Files) {
+ F.back() = '*';
+ F.front() = '*';
+ S += "src:";
+ S += F;
+ S += "\n";
+ }
+ return S;
+}
+
+void BM_Make_(
+ benchmark::State &state,
+ std::string (*GenerateCaseList)(const std::vector<std::string> &Files)) {
+ std::vector<std::string> BigFileList = genFiles(state.range(0));
+ std::string BigCaseList = GenerateCaseList(BigFileList);
+ for (auto _ : state) {
+ auto SCL = makeSpecialCaseList(BigCaseList);
+ benchmark::DoNotOptimize(SCL);
+ }
+}
+void BM_True_(
+ benchmark::State &state,
+ std::string (*GenerateCaseList)(const std::vector<std::string> &Files)) {
+ std::vector<std::string> BigFileList = genFiles(state.range(0));
+ std::string BigCaseList = GenerateCaseList(BigFileList);
+ auto SCL = makeSpecialCaseList(BigCaseList);
+ std::minstd_rand Rng(RNG_SEED);
+ std::uniform_int_distribution<> LineDistrib(0, BigFileList.size() - 1);
+ for (auto _ : state) {
+ auto &Q = BigFileList[LineDistrib(Rng)];
+ bool R = SCL->inSection("", "src", Q);
+ if (!R)
+ abort();
+ benchmark::DoNotOptimize(R);
+ }
+}
+
+void BM_False(
+ benchmark::State &state,
+ std::string (*GenerateCaseList)(const std::vector<std::string> &Files)) {
+ std::vector<std::string> BigFileList = genFiles(state.range(0));
+ std::string BigCaseList = GenerateCaseList(BigFileList);
+ auto SCL = makeSpecialCaseList(BigCaseList);
+ std::minstd_rand Rng(RNG_SEED);
+ std::uniform_int_distribution<> LineDistrib(0, BigFileList.size() - 1);
+ for (auto _ : state) {
+ std::string Q = BigFileList[LineDistrib(Rng)];
+ std::uniform_int_distribution<> PosDistrib(0, Q.size() - 1);
+ Q[PosDistrib(Rng)] = '_';
+ bool R = SCL->inSection("", "src", Q);
+ benchmark::DoNotOptimize(R);
+ }
+}
+
+} // namespace
+
+BENCHMARK_CAPTURE(BM_Make_, None_, genGlobNone)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_Make_, Start, genGlobAtStart)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_Make_, End__, genGlobAtEnd)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_Make_, Mid__, genGlobInMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_Make_, Both_, genGlobAtBothSides)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+
+BENCHMARK_CAPTURE(BM_True_, None_, genGlobNone)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_True_, Start, genGlobAtStart)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_True_, End__, genGlobAtEnd)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_True_, Mid__, genGlobInMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_True_, Both_, genGlobAtBothSides)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+
+BENCHMARK_CAPTURE(BM_False, None_, genGlobNone)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_False, Start, genGlobAtStart)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_False, End__, genGlobAtEnd)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_False, Mid__, genGlobInMid)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+BENCHMARK_CAPTURE(BM_False, Both_, genGlobAtBothSides)
+ ->RangeMultiplier(MAX_LIST_MUL)
+ ->Range(MAX_LIST_MIN, MAX_LIST_MAX);
+
+BENCHMARK_MAIN();
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 8b6c25c..4884e2d 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -21074,12 +21074,12 @@ Overview:
The '``llvm.matrix.column.major.load.*``' intrinsics load a ``<Rows> x <Cols>``
matrix using a stride of ``%Stride`` to compute the start address of the
-different columns. The offset is computed using ``%Stride``'s bitwidth. This
-allows for convenient loading of sub matrixes. If ``<IsVolatile>`` is true, the
-intrinsic is considered a :ref:`volatile memory access <volatile>`. The result
-matrix is returned in the result vector. If the ``%Ptr`` argument is known to
-be aligned to some boundary, this can be specified as an attribute on the
-argument.
+different columns. This allows for convenient loading of sub matrixes.
+Independent of ``%Stride``'s bitwidth, the offset is computed using the target
+data layout's pointer index type. If ``<IsVolatile>`` is true, the intrinsic is
+considered a :ref:`volatile memory access <volatile>`. The result matrix is
+returned in the result vector. If the ``%Ptr`` argument is known to be aligned
+to some boundary, this can be specified as an attribute on the argument.
Arguments:
""""""""""
@@ -21114,9 +21114,9 @@ Overview:
The '``llvm.matrix.column.major.store.*``' intrinsics store the ``<Rows> x
<Cols>`` matrix in ``%In`` to memory using a stride of ``%Stride`` between
-columns. The offset is computed using ``%Stride``'s bitwidth. If
-``<IsVolatile>`` is true, the intrinsic is considered a
-:ref:`volatile memory access <volatile>`.
+columns. Independent of ``%Stride``'s bitwidth, the offset is computed using
+the target data layout's pointer index type. If ``<IsVolatile>`` is true, the
+intrinsic is considered a :ref:`volatile memory access <volatile>`.
If the ``%Ptr`` argument is known to be aligned to some boundary, this can be
specified as an attribute on the argument.
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 79d93d0..640516a 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -134,6 +134,8 @@ Changes to the WebAssembly Backend
Changes to the Windows Target
-----------------------------
+* `-fpseudo-probe-for-profiling` is now supported for COFF.
+
Changes to the X86 Backend
--------------------------
@@ -147,6 +149,7 @@ Changes to the C API
--------------------
* Add `LLVMGetOrInsertFunction` to get or insert a function, replacing the combination of `LLVMGetNamedFunction` and `LLVMAddFunction`.
+* Allow `LLVMGetVolatile` to work with any kind of Instruction.
Changes to the CodeGen infrastructure
-------------------------------------
@@ -160,6 +163,8 @@ Changes to the Debug Info
Changes to the LLVM tools
---------------------------------
+* `llvm-profgen` now supports decoding pseudo probe for COFF binaries.
+
* `llvm-readelf` now dumps all hex format values in lower-case mode.
* Some code paths for supporting Python 2.7 in `llvm-lit` have been removed.
* Support for `%T` in lit has been removed.
@@ -169,6 +174,9 @@ Changes to LLDB
* LLDB can now set breakpoints, show backtraces, and display variables when
debugging Wasm with supported runtimes (WAMR and V8).
+* The `show-progress` setting, which became a NOOP with the introduction of the
+ statusline, now defaults to off and controls using OSC escape codes to show a
+ native progress bar in supporting terminals like Ghostty and ConEmu.
Changes to BOLT
---------------------------------
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 3d22f859..4e380d9 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -4757,7 +4757,7 @@ LLVM_C_ABI LLVMValueRef LLVMBuildGlobalString(LLVMBuilderRef B, const char *Str,
LLVM_C_ABI LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B,
const char *Str,
const char *Name);
-LLVM_C_ABI LLVMBool LLVMGetVolatile(LLVMValueRef MemoryAccessInst);
+LLVM_C_ABI LLVMBool LLVMGetVolatile(LLVMValueRef Inst);
LLVM_C_ABI void LLVMSetVolatile(LLVMValueRef MemoryAccessInst,
LLVMBool IsVolatile);
LLVM_C_ABI LLVMBool LLVMGetWeak(LLVMValueRef CmpXchgInst);
diff --git a/llvm/include/llvm/ADT/Bitfields.h b/llvm/include/llvm/ADT/Bitfields.h
index 4064d71..1af2761 100644
--- a/llvm/include/llvm/ADT/Bitfields.h
+++ b/llvm/include/llvm/ADT/Bitfields.h
@@ -86,89 +86,43 @@
#include <limits> // numeric_limits
#include <type_traits>
+#include "llvm/Support/MathExtras.h"
+
namespace llvm {
namespace bitfields_details {
-/// A struct defining useful bit patterns for n-bits integer types.
-template <typename T, unsigned Bits> struct BitPatterns {
- /// Bit patterns are forged using the equivalent `Unsigned` type because of
- /// undefined operations over signed types (e.g. Bitwise shift operators).
- /// Moreover same size casting from unsigned to signed is well defined but not
- /// the other way around.
- using Unsigned = std::make_unsigned_t<T>;
- static_assert(sizeof(Unsigned) == sizeof(T), "Types must have same size");
-
- static constexpr unsigned TypeBits = sizeof(Unsigned) * CHAR_BIT;
- static_assert(TypeBits >= Bits, "n-bit must fit in T");
-
- /// e.g. with TypeBits == 8 and Bits == 6.
- static constexpr Unsigned AllZeros = Unsigned(0); // 00000000
- static constexpr Unsigned AllOnes = ~Unsigned(0); // 11111111
- static constexpr Unsigned Umin = AllZeros; // 00000000
- static constexpr Unsigned Umax = AllOnes >> (TypeBits - Bits); // 00111111
- static constexpr Unsigned SignBitMask = Unsigned(1) << (Bits - 1); // 00100000
- static constexpr Unsigned Smax = Umax >> 1U; // 00011111
- static constexpr Unsigned Smin = ~Smax; // 11100000
- static constexpr Unsigned SignExtend = Unsigned(Smin << 1U); // 11000000
-};
-
-/// `Compressor` is used to manipulate the bits of a (possibly signed) integer
-/// type so it can be packed and unpacked into a `bits` sized integer,
-/// `Compressor` is specialized on signed-ness so no runtime cost is incurred.
-/// The `pack` method also checks that the passed in `UserValue` is valid.
-template <typename T, unsigned Bits, bool = std::is_unsigned<T>::value>
-struct Compressor {
- static_assert(std::is_unsigned<T>::value, "T must be unsigned");
- using BP = BitPatterns<T, Bits>;
-
- static T pack(T UserValue, T UserMaxValue) {
- assert(UserValue <= UserMaxValue && "value is too big");
- assert(UserValue <= BP::Umax && "value is too big");
- return UserValue;
- }
-
- static T unpack(T StorageValue) { return StorageValue; }
-};
-
-template <typename T, unsigned Bits> struct Compressor<T, Bits, false> {
- static_assert(std::is_signed<T>::value, "T must be signed");
- using BP = BitPatterns<T, Bits>;
-
- static T pack(T UserValue, T UserMaxValue) {
- assert(UserValue <= UserMaxValue && "value is too big");
- assert(UserValue <= T(BP::Smax) && "value is too big");
- assert(UserValue >= T(BP::Smin) && "value is too small");
- if (UserValue < 0)
- UserValue &= ~BP::SignExtend;
- return UserValue;
- }
-
- static T unpack(T StorageValue) {
- if (StorageValue >= T(BP::SignBitMask))
- StorageValue |= BP::SignExtend;
- return StorageValue;
- }
-};
-
/// Impl is where Bifield description and Storage are put together to interact
/// with values.
template <typename Bitfield, typename StorageType> struct Impl {
static_assert(std::is_unsigned<StorageType>::value,
"Storage must be unsigned");
using IntegerType = typename Bitfield::IntegerType;
- using C = Compressor<IntegerType, Bitfield::Bits>;
- using BP = BitPatterns<StorageType, Bitfield::Bits>;
static constexpr size_t StorageBits = sizeof(StorageType) * CHAR_BIT;
static_assert(Bitfield::FirstBit <= StorageBits, "Data must fit in mask");
static_assert(Bitfield::LastBit <= StorageBits, "Data must fit in mask");
- static constexpr StorageType Mask = BP::Umax << Bitfield::Shift;
+ static constexpr StorageType LowMask =
+ maskTrailingOnes<StorageType>(Bitfield::Bits);
+ static constexpr StorageType Mask = LowMask << Bitfield::Shift;
+
+ /// Validates that `UserValue` fits within the bitfield's range.
+ static void checkValue(IntegerType UserValue, IntegerType UserMaxValue) {
+ assert(UserValue <= UserMaxValue && "value is too big");
+ if constexpr (std::is_unsigned_v<IntegerType>) {
+ assert(isUInt<Bitfield::Bits>(UserValue) && "value is too big");
+ } else {
+ static_assert(std::is_signed_v<IntegerType>,
+ "IntegerType must be signed");
+ assert(isInt<Bitfield::Bits>(UserValue) && "value is out of range");
+ }
+ }
/// Checks `UserValue` is within bounds and packs it between `FirstBit` and
/// `LastBit` of `Packed` leaving the rest unchanged.
static void update(StorageType &Packed, IntegerType UserValue) {
- const StorageType StorageValue = C::pack(UserValue, Bitfield::UserMaxValue);
+ checkValue(UserValue, Bitfield::UserMaxValue);
+ const StorageType StorageValue = UserValue & LowMask;
Packed &= ~Mask;
Packed |= StorageValue << Bitfield::Shift;
}
@@ -177,7 +131,9 @@ template <typename Bitfield, typename StorageType> struct Impl {
/// an`IntegerType`.
static IntegerType extract(StorageType Packed) {
const StorageType StorageValue = (Packed & Mask) >> Bitfield::Shift;
- return C::unpack(StorageValue);
+ if constexpr (std::is_signed_v<IntegerType>)
+ return SignExtend64<Bitfield::Bits>(StorageValue);
+ return StorageValue;
}
/// Interprets bits between `FirstBit` and `LastBit` of `Packed` as
diff --git a/llvm/include/llvm/ADT/StringExtras.h b/llvm/include/llvm/ADT/StringExtras.h
index 7d81c63..2440e76 100644
--- a/llvm/include/llvm/ADT/StringExtras.h
+++ b/llvm/include/llvm/ADT/StringExtras.h
@@ -529,13 +529,15 @@ inline std::string join_items(Sep Separator, Args &&... Items) {
class ListSeparator {
bool First = true;
StringRef Separator;
+ StringRef Prefix;
public:
- ListSeparator(StringRef Separator = ", ") : Separator(Separator) {}
+ ListSeparator(StringRef Separator = ", ", StringRef Prefix = "")
+ : Separator(Separator), Prefix(Prefix) {}
operator StringRef() {
if (First) {
First = false;
- return {};
+ return Prefix;
}
return Separator;
}
diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h
index 0ce7c57a..a96535c 100644
--- a/llvm/include/llvm/ADT/StringSwitch.h
+++ b/llvm/include/llvm/ADT/StringSwitch.h
@@ -17,6 +17,7 @@
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
#include <cstring>
+#include <initializer_list>
#include <optional>
namespace llvm {
@@ -85,55 +86,60 @@ public:
return *this;
}
+ StringSwitch &Cases(std::initializer_list<StringLiteral> CaseStrings,
+ T Value) {
+ return CasesImpl(Value, CaseStrings);
+ }
+
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, T Value) {
- return CasesImpl(Value, S0, S1);
+ return CasesImpl(Value, {S0, S1});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
T Value) {
- return CasesImpl(Value, S0, S1, S2);
+ return CasesImpl(Value, {S0, S1, S2});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3);
+ return CasesImpl(Value, {S0, S1, S2, S3});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4, S5});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4, S5, S6});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, StringLiteral S7, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4, S5, S6, S7});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, StringLiteral S7, StringLiteral S8,
T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7, S8);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4, S5, S6, S7, S8});
}
StringSwitch &Cases(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, StringLiteral S5,
StringLiteral S6, StringLiteral S7, StringLiteral S8,
StringLiteral S9, T Value) {
- return CasesImpl(Value, S0, S1, S2, S3, S4, S5, S6, S7, S8, S9);
+ return CasesImpl(Value, {S0, S1, S2, S3, S4, S5, S6, S7, S8, S9});
}
// Case-insensitive case matchers.
@@ -156,23 +162,28 @@ public:
return *this;
}
+ StringSwitch &CasesLower(std::initializer_list<StringLiteral> CaseStrings,
+ T Value) {
+ return CasesLowerImpl(Value, CaseStrings);
+ }
+
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, T Value) {
- return CasesLowerImpl(Value, S0, S1);
+ return CasesLowerImpl(Value, {S0, S1});
}
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
T Value) {
- return CasesLowerImpl(Value, S0, S1, S2);
+ return CasesLowerImpl(Value, {S0, S1, S2});
}
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, T Value) {
- return CasesLowerImpl(Value, S0, S1, S2, S3);
+ return CasesLowerImpl(Value, {S0, S1, S2, S3});
}
StringSwitch &CasesLower(StringLiteral S0, StringLiteral S1, StringLiteral S2,
StringLiteral S3, StringLiteral S4, T Value) {
- return CasesLowerImpl(Value, S0, S1, S2, S3, S4);
+ return CasesLowerImpl(Value, {S0, S1, S2, S3, S4});
}
[[nodiscard]] R Default(T Value) {
@@ -211,16 +222,21 @@ private:
return false;
}
- template <typename... Args> StringSwitch &CasesImpl(T &Value, Args... Cases) {
+ StringSwitch &CasesImpl(T &Value,
+ std::initializer_list<StringLiteral> Cases) {
// Stop matching after the string is found.
- (... || CaseImpl(Value, Cases));
+ for (StringLiteral S : Cases)
+ if (CaseImpl(Value, S))
+ break;
return *this;
}
- template <typename... Args>
- StringSwitch &CasesLowerImpl(T &Value, Args... Cases) {
+ StringSwitch &CasesLowerImpl(T &Value,
+ std::initializer_list<StringLiteral> Cases) {
// Stop matching after the string is found.
- (... || CaseLowerImpl(Value, Cases));
+ for (StringLiteral S : Cases)
+ if (CaseLowerImpl(Value, S))
+ break;
return *this;
}
};
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index 8876e4e..e5a6c8c 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -2316,10 +2316,6 @@ private:
/// an add rec on said loop.
void getUsedLoops(const SCEV *S, SmallPtrSetImpl<const Loop *> &LoopsUsed);
- /// Try to match the pattern generated by getURemExpr(A, B). If successful,
- /// Assign A and B to LHS and RHS, respectively.
- LLVM_ABI bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS);
-
/// Look for a SCEV expression with type `SCEVType` and operands `Ops` in
/// `UniqueSCEVs`. Return if found, else nullptr.
SCEV *findExistingSCEVInCache(SCEVTypes SCEVType, ArrayRef<const SCEV *> Ops);
diff --git a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
index 164b46b..871028d 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolutionPatternMatch.h
@@ -182,6 +182,12 @@ m_scev_PtrToInt(const Op0_t &Op0) {
return SCEVUnaryExpr_match<SCEVPtrToIntExpr, Op0_t>(Op0);
}
+template <typename Op0_t>
+inline SCEVUnaryExpr_match<SCEVTruncateExpr, Op0_t>
+m_scev_Trunc(const Op0_t &Op0) {
+ return m_scev_Unary<SCEVTruncateExpr>(Op0);
+}
+
/// Match a binary SCEV.
template <typename SCEVTy, typename Op0_t, typename Op1_t,
SCEV::NoWrapFlags WrapFlags = SCEV::FlagAnyWrap,
@@ -246,6 +252,80 @@ m_scev_UDiv(const Op0_t &Op0, const Op1_t &Op1) {
return m_scev_Binary<SCEVUDivExpr>(Op0, Op1);
}
+/// Match unsigned remainder pattern.
+/// Matches patterns generated by getURemExpr.
+template <typename Op0_t, typename Op1_t> struct SCEVURem_match {
+ Op0_t Op0;
+ Op1_t Op1;
+ ScalarEvolution &SE;
+
+ SCEVURem_match(Op0_t Op0, Op1_t Op1, ScalarEvolution &SE)
+ : Op0(Op0), Op1(Op1), SE(SE) {}
+
+ bool match(const SCEV *Expr) const {
+ if (Expr->getType()->isPointerTy())
+ return false;
+
+ // Try to match 'zext (trunc A to iB) to iY', which is used
+ // for URem with constant power-of-2 second operands. Make sure the size of
+ // the operand A matches the size of the whole expression.
+ const SCEV *LHS;
+ if (SCEVPatternMatch::match(Expr, m_scev_ZExt(m_scev_Trunc(m_SCEV(LHS))))) {
+ Type *TruncTy = cast<SCEVZeroExtendExpr>(Expr)->getOperand()->getType();
+ // Bail out if the type of the LHS is larger than the type of the
+ // expression for now.
+ if (SE.getTypeSizeInBits(LHS->getType()) >
+ SE.getTypeSizeInBits(Expr->getType()))
+ return false;
+ if (LHS->getType() != Expr->getType())
+ LHS = SE.getZeroExtendExpr(LHS, Expr->getType());
+ const SCEV *RHS =
+ SE.getConstant(APInt(SE.getTypeSizeInBits(Expr->getType()), 1)
+ << SE.getTypeSizeInBits(TruncTy));
+ return Op0.match(LHS) && Op1.match(RHS);
+ }
+ const auto *Add = dyn_cast<SCEVAddExpr>(Expr);
+ if (Add == nullptr || Add->getNumOperands() != 2)
+ return false;
+
+ const SCEV *A = Add->getOperand(1);
+ const auto *Mul = dyn_cast<SCEVMulExpr>(Add->getOperand(0));
+
+ if (Mul == nullptr)
+ return false;
+
+ const auto MatchURemWithDivisor = [&](const SCEV *B) {
+ // (SomeExpr + (-(SomeExpr / B) * B)).
+ if (Expr == SE.getURemExpr(A, B))
+ return Op0.match(A) && Op1.match(B);
+ return false;
+ };
+
+ // (SomeExpr + (-1 * (SomeExpr / B) * B)).
+ if (Mul->getNumOperands() == 3 && isa<SCEVConstant>(Mul->getOperand(0)))
+ return MatchURemWithDivisor(Mul->getOperand(1)) ||
+ MatchURemWithDivisor(Mul->getOperand(2));
+
+ // (SomeExpr + ((-SomeExpr / B) * B)) or (SomeExpr + ((SomeExpr / B) * -B)).
+ if (Mul->getNumOperands() == 2)
+ return MatchURemWithDivisor(Mul->getOperand(1)) ||
+ MatchURemWithDivisor(Mul->getOperand(0)) ||
+ MatchURemWithDivisor(SE.getNegativeSCEV(Mul->getOperand(1))) ||
+ MatchURemWithDivisor(SE.getNegativeSCEV(Mul->getOperand(0)));
+ return false;
+ }
+};
+
+/// Match the mathematical pattern A - (A / B) * B, where A and B can be
+/// arbitrary expressions. Also match zext (trunc A to iB) to iY, which is used
+/// for URem with constant power-of-2 second operands. It's not always easy, as
+/// A and B can be folded (imagine A is X / 2, and B is 4, A / B becomes X / 8).
+template <typename Op0_t, typename Op1_t>
+inline SCEVURem_match<Op0_t, Op1_t> m_scev_URem(Op0_t LHS, Op1_t RHS,
+ ScalarEvolution &SE) {
+ return SCEVURem_match<Op0_t, Op1_t>(LHS, RHS, SE);
+}
+
inline class_match<const Loop> m_Loop() { return class_match<const Loop>(); }
/// Match an affine SCEVAddRecExpr.
diff --git a/llvm/include/llvm/Analysis/StaticDataProfileInfo.h b/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
index fa21eba..f06e7ce 100644
--- a/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
+++ b/llvm/include/llvm/Analysis/StaticDataProfileInfo.h
@@ -10,6 +10,24 @@
namespace llvm {
+namespace memprof {
+// Represents the eligibility status of a global variable for section prefix
+// annotation. Other than AnnotationOk, each enum value indicates a specific
+// reason for ineligibility.
+enum class AnnotationKind : uint8_t {
+ AnnotationOK,
+ DeclForLinker,
+ ExplicitSection,
+ ReservedName,
+};
+/// Returns the annotation kind of the global variable \p GV.
+AnnotationKind getAnnotationKind(const GlobalVariable &GV);
+
+/// Returns true if the annotation kind of the global variable \p GV is
+/// AnnotationOK.
+bool IsAnnotationOK(const GlobalVariable &GV);
+} // namespace memprof
+
/// A class that holds the constants that represent static data and their
/// profile information and provides methods to operate on them.
class StaticDataProfileInfo {
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index c76c83d..ff3dd0d 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -514,6 +514,12 @@ enum NodeType {
/// separately rounded operations.
FMAD,
+ /// FMULADD - Performs a * b + c, with, or without, intermediate rounding.
+ /// It is expected that this will be illegal for most targets, as it usually
+ /// makes sense to split this or use an FMA. But some targets, such as
+ /// WebAssembly, can directly support these semantics.
+ FMULADD,
+
/// FCOPYSIGN(X, Y) - Return the value of X with the sign of Y. NOTE: This
/// DAG node does not require that X and Y have the same type, just that
/// they are both floating point. X and the result must have the same type.
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index 62c0806..df6ce0f 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1850,9 +1850,11 @@ public:
/// Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList,
ArrayRef<SDValue> Ops,
- const SDNodeFlags Flags);
+ const SDNodeFlags Flags,
+ bool AllowCommute = false);
LLVM_ABI SDNode *getNodeIfExists(unsigned Opcode, SDVTList VTList,
- ArrayRef<SDValue> Ops);
+ ArrayRef<SDValue> Ops,
+ bool AllowCommute = false);
/// Check if a node exists without modifying its flags.
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList,
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h b/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h
index 44ef289..41c3089 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MemoryMapper.h
@@ -51,7 +51,11 @@ public:
virtual void reserve(size_t NumBytes, OnReservedFunction OnReserved) = 0;
/// Provides working memory
- virtual char *prepare(ExecutorAddr Addr, size_t ContentSize) = 0;
+ /// The LinkGraph parameter is included to allow implementations to allocate
+ /// working memory from the LinkGraph's allocator, in which case it will be
+ /// deallocated when the LinkGraph is destroyed.
+ virtual char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) = 0;
using OnInitializedFunction = unique_function<void(Expected<ExecutorAddr>)>;
@@ -92,7 +96,8 @@ public:
void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override;
- char *prepare(ExecutorAddr Addr, size_t ContentSize) override;
+ char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) override;
void deinitialize(ArrayRef<ExecutorAddr> Allocations,
OnDeinitializedFunction OnDeInitialized) override;
@@ -142,7 +147,8 @@ public:
void reserve(size_t NumBytes, OnReservedFunction OnReserved) override;
- char *prepare(ExecutorAddr Addr, size_t ContentSize) override;
+ char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) override;
void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override;
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index bba0d6e..86a9e24 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -353,6 +353,7 @@ def OMPC_Novariants : Clause<[Spelling<"novariants">]> {
}
def OMPC_NoWait : Clause<[Spelling<"nowait">]> {
let clangClass = "OMPNowaitClause";
+ let isValueOptional = true;
}
def OMP_NUMTASKS_Strict : EnumVal<"strict", 1, 1> {}
def OMP_NUMTASKS_Unknown : EnumVal<"unknown", 2, 0> { let isDefault = 1; }
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 1694a33..46b3d53 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -472,7 +472,7 @@ __OMP_RTL(__kmpc_target_init, false, Int32, KernelEnvironmentPtr, KernelLaunchEn
__OMP_RTL(__kmpc_target_deinit, false, Void,)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
- FuncPtrTy, VoidPtr, VoidPtrPtr, SizeTy)
+ FuncPtrTy, FuncPtrTy, VoidPtrPtr, SizeTy)
__OMP_RTL(__kmpc_for_static_loop_4, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
__OMP_RTL(__kmpc_for_static_loop_4u, false, Void, IdentPtr, VoidPtr, VoidPtr, Int32, Int32, Int32, Int8)
__OMP_RTL(__kmpc_for_static_loop_8, false, Void, IdentPtr, VoidPtr, VoidPtr, Int64, Int64, Int64, Int8)
diff --git a/llvm/include/llvm/IR/ConstantFPRange.h b/llvm/include/llvm/IR/ConstantFPRange.h
index 39dc7c1..e772095 100644
--- a/llvm/include/llvm/IR/ConstantFPRange.h
+++ b/llvm/include/llvm/IR/ConstantFPRange.h
@@ -230,6 +230,19 @@ public:
/// Return a new range representing the possible values resulting
/// from a subtraction of a value in this range and a value in \p Other.
LLVM_ABI ConstantFPRange sub(const ConstantFPRange &Other) const;
+
+ /// Return a new range representing the possible values resulting
+ /// from a multiplication of a value in this range and a value in \p Other.
+ LLVM_ABI ConstantFPRange mul(const ConstantFPRange &Other) const;
+
+ /// Return a new range representing the possible values resulting
+ /// from a division of a value in this range and a value in
+ /// \p Other.
+ LLVM_ABI ConstantFPRange div(const ConstantFPRange &Other) const;
+
+ /// Flush denormal values to zero according to the specified mode.
+ /// For dynamic mode, we return the union of all possible results.
+ LLVM_ABI void flushDenormals(DenormalMode::DenormalModeKind Mode);
};
inline raw_ostream &operator<<(raw_ostream &OS, const ConstantFPRange &CR) {
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index 041a4ce..dacda0a 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -2548,6 +2548,11 @@ public:
std::optional<RoundingMode> Rounding = std::nullopt,
std::optional<fp::ExceptionBehavior> Except = std::nullopt);
+ LLVM_ABI Value *CreateSelectWithUnknownProfile(Value *C, Value *True,
+ Value *False,
+ StringRef PassName,
+ const Twine &Name = "");
+
LLVM_ABI Value *CreateSelect(Value *C, Value *True, Value *False,
const Twine &Name = "",
Instruction *MDFrom = nullptr);
diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h
index 48fc600..39a08d4 100644
--- a/llvm/include/llvm/Support/DebugCounter.h
+++ b/llvm/include/llvm/Support/DebugCounter.h
@@ -178,6 +178,7 @@ protected:
std::string Desc;
SmallVector<Chunk> Chunks;
};
+ bool handleCounterIncrement(CounterInfo &Info);
DenseMap<unsigned, CounterInfo> Counters;
CounterVector RegisteredCounters;
@@ -188,6 +189,8 @@ protected:
bool ShouldPrintCounter = false;
+ bool ShouldPrintCounterQueries = false;
+
bool BreakOnLast = false;
};
diff --git a/llvm/include/llvm/Support/Format.h b/llvm/include/llvm/Support/Format.h
index 34b224d..b549341 100644
--- a/llvm/include/llvm/Support/Format.h
+++ b/llvm/include/llvm/Support/Format.h
@@ -78,9 +78,20 @@ public:
/// printed, this synthesizes the string into a temporary buffer provided and
/// returns whether or not it is big enough.
+namespace detail {
+template <typename T> struct decay_if_c_char_array {
+ using type = T;
+};
+template <std::size_t N> struct decay_if_c_char_array<char[N]> {
+ using type = const char *;
+};
+template <typename T>
+using decay_if_c_char_array_t = typename decay_if_c_char_array<T>::type;
+} // namespace detail
+
template <typename... Ts>
class format_object final : public format_object_base {
- std::tuple<Ts...> Vals;
+ std::tuple<detail::decay_if_c_char_array_t<Ts>...> Vals;
template <std::size_t... Is>
int snprint_tuple(char *Buffer, unsigned BufferSize,
@@ -96,7 +107,7 @@ public:
format_object(const char *fmt, const Ts &... vals)
: format_object_base(fmt), Vals(vals...) {
static_assert(
- (std::is_scalar_v<Ts> && ...),
+ (std::is_scalar_v<detail::decay_if_c_char_array_t<Ts>> && ...),
"format can't be used with non fundamental / non pointer type");
}
diff --git a/llvm/include/llvm/Support/SpecialCaseList.h b/llvm/include/llvm/Support/SpecialCaseList.h
index 466e2a4..ead7655 100644
--- a/llvm/include/llvm/Support/SpecialCaseList.h
+++ b/llvm/include/llvm/Support/SpecialCaseList.h
@@ -115,7 +115,8 @@ protected:
// classes.
LLVM_ABI bool createInternal(const std::vector<std::string> &Paths,
vfs::FileSystem &VFS, std::string &Error);
- LLVM_ABI bool createInternal(const MemoryBuffer *MB, std::string &Error);
+ LLVM_ABI bool createInternal(const MemoryBuffer *MB, std::string &Error,
+ bool OrderBySize = false);
SpecialCaseList() = default;
SpecialCaseList(SpecialCaseList const &) = delete;
@@ -126,6 +127,8 @@ private:
class RegexMatcher {
public:
LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
+ LLVM_ABI void preprocess(bool BySize);
+
LLVM_ABI void
match(StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const;
@@ -144,6 +147,8 @@ private:
class GlobMatcher {
public:
LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
+ LLVM_ABI void preprocess(bool BySize);
+
LLVM_ABI void
match(StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const;
@@ -164,6 +169,9 @@ private:
public:
LLVM_ABI Matcher(bool UseGlobs, bool RemoveDotSlash);
+ LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
+ LLVM_ABI void preprocess(bool BySize);
+
LLVM_ABI void
match(StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const;
@@ -174,8 +182,6 @@ private:
return R;
}
- LLVM_ABI Error insert(StringRef Pattern, unsigned LineNumber);
-
std::variant<RegexMatcher, GlobMatcher> M;
bool RemoveDotSlash;
};
@@ -206,6 +212,8 @@ protected:
StringRef Category) const;
private:
+ friend class SpecialCaseList;
+ LLVM_ABI void preprocess(bool OrderBySize);
LLVM_ABI const SpecialCaseList::Matcher *
findMatcher(StringRef Prefix, StringRef Category) const;
};
@@ -222,7 +230,7 @@ private:
/// Parses just-constructed SpecialCaseList entries from a memory buffer.
LLVM_ABI bool parse(unsigned FileIdx, const MemoryBuffer *MB,
- std::string &Error);
+ std::string &Error, bool OrderBySize);
};
} // namespace llvm
diff --git a/llvm/include/llvm/TableGen/CodeGenHelpers.h b/llvm/include/llvm/TableGen/CodeGenHelpers.h
index 7dca6a0..5b823db 100644
--- a/llvm/include/llvm/TableGen/CodeGenHelpers.h
+++ b/llvm/include/llvm/TableGen/CodeGenHelpers.h
@@ -38,28 +38,35 @@ private:
// namespace (empty for anonymous namespace) or nested namespace.
class NamespaceEmitter {
public:
- NamespaceEmitter(raw_ostream &OS, StringRef Name) : OS(OS) {
- emitNamespaceStarts(Name);
+ NamespaceEmitter(raw_ostream &OS, StringRef Name)
+ : Name(trim(Name).str()), OS(OS) {
+ OS << "namespace " << this->Name << " {\n";
}
~NamespaceEmitter() { close(); }
// Explicit function to close the namespace scopes.
void close() {
- for (StringRef NS : llvm::reverse(Namespaces))
- OS << "} // namespace " << NS << "\n";
- Namespaces.clear();
+ if (!Closed)
+ OS << "} // namespace " << Name << "\n";
+ Closed = true;
}
private:
- void emitNamespaceStarts(StringRef Name) {
- llvm::SplitString(Name, Namespaces, "::");
- for (StringRef NS : Namespaces)
- OS << "namespace " << NS << " {\n";
+ // Trim "::" prefix. If the namespace specified is "::mlir::toy", then the
+ // generated namespace scope needs to use
+ //
+ // namespace mlir::toy {
+ // }
+ //
+ // and cannot use "namespace ::mlir::toy".
+ static StringRef trim(StringRef Name) {
+ Name.consume_front("::");
+ return Name;
}
-
- SmallVector<StringRef, 2> Namespaces;
+ std::string Name;
raw_ostream &OS;
+ bool Closed = false;
};
} // end namespace llvm
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 632be7a..07a858f 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -535,6 +535,7 @@ def fdiv : SDNode<"ISD::FDIV" , SDTFPBinOp>;
def frem : SDNode<"ISD::FREM" , SDTFPBinOp>;
def fma : SDNode<"ISD::FMA" , SDTFPTernaryOp, [SDNPCommutative]>;
def fmad : SDNode<"ISD::FMAD" , SDTFPTernaryOp, [SDNPCommutative]>;
+def fmuladd : SDNode<"ISD::FMULADD" , SDTFPTernaryOp, [SDNPCommutative]>;
def fabs : SDNode<"ISD::FABS" , SDTFPUnaryOp>;
def fminnum : SDNode<"ISD::FMINNUM" , SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
index b1fca55..2ac58a5 100644
--- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h
+++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h
@@ -161,6 +161,8 @@ inline static bool isAltFmt(unsigned VType) { return VType & 0x100; }
LLVM_ABI void printVType(unsigned VType, raw_ostream &OS);
+LLVM_ABI void printXSfmmVType(unsigned VType, raw_ostream &OS);
+
LLVM_ABI unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul);
LLVM_ABI std::optional<VLMUL> getSameRatioLMUL(unsigned SEW, VLMUL VLMUL,
diff --git a/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h b/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
index 558984f..eb2b34d 100644
--- a/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
+++ b/llvm/include/llvm/Transforms/Coroutines/MaterializationUtils.h
@@ -12,9 +12,7 @@
#ifndef LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
#define LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
// True if I is trivially rematerialzable, e.g. InsertElementInst
LLVM_ABI bool isTriviallyMaterializable(Instruction &I);
@@ -24,8 +22,6 @@ LLVM_ABI void
doRematerializations(Function &F, SuspendCrossingInfo &Checker,
std::function<bool(Instruction &)> IsMaterializable);
-} // namespace coro
-
-} // namespace llvm
+} // namespace llvm::coro
#endif // LLVM_TRANSFORMS_COROUTINES_MATERIALIZATIONUTILS_H
diff --git a/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h b/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
index 6cdf83c0..356f9ca 100644
--- a/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
+++ b/llvm/include/llvm/Transforms/Coroutines/SpillUtils.h
@@ -13,9 +13,7 @@
#ifndef LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
#define LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
using SpillInfo = SmallMapVector<Value *, SmallVector<Instruction *, 2>, 8>;
@@ -38,6 +36,7 @@ void collectSpillsAndAllocasFromInsts(
SmallVector<CoroAllocaAllocInst *, 4> &LocalAllocas, Function &F,
const SuspendCrossingInfo &Checker, const DominatorTree &DT,
const coro::Shape &Shape);
+
void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
const SuspendCrossingInfo &Checker);
@@ -52,8 +51,6 @@ void sinkSpillUsesAfterCoroBegin(const DominatorTree &DT,
BasicBlock::iterator getSpillInsertionPt(const coro::Shape &, Value *Def,
const DominatorTree &DT);
-} // namespace coro
-
-} // namespace llvm
+} // namespace llvm::coro
#endif // LLVM_TRANSFORMS_COROUTINES_SPILLINGINFO_H
diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
index 48e8c86..2db3f6d4 100644
--- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
+++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterBulk.h
@@ -13,7 +13,6 @@
#ifndef LLVM_TRANSFORMS_UTILS_SSAUPDATERBULK_H
#define LLVM_TRANSFORMS_UTILS_SSAUPDATERBULK_H
-#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/PredIteratorCache.h"
#include "llvm/Support/Compiler.h"
@@ -79,6 +78,10 @@ public:
LLVM_ABI void
RewriteAllUses(DominatorTree *DT,
SmallVectorImpl<PHINode *> *InsertedPHIs = nullptr);
+
+ /// Rewrite all uses and simplify the inserted PHI nodes.
+ /// Use this method to preserve behavior when replacing SSAUpdater.
+ void RewriteAndOptimizeAllUses(DominatorTree &DT);
};
} // end namespace llvm
diff --git a/llvm/include/llvm/XRay/BlockIndexer.h b/llvm/include/llvm/XRay/BlockIndexer.h
index e9782da..155e6bd 100644
--- a/llvm/include/llvm/XRay/BlockIndexer.h
+++ b/llvm/include/llvm/XRay/BlockIndexer.h
@@ -19,8 +19,7 @@
#include <cstdint>
#include <vector>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
// The BlockIndexer will gather all related records associated with a
// process+thread and group them by 'Block'.
@@ -63,7 +62,6 @@ public:
Error flush();
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_BLOCKINDEXER_H
diff --git a/llvm/include/llvm/XRay/BlockPrinter.h b/llvm/include/llvm/XRay/BlockPrinter.h
index caf78c5..81944a5 100644
--- a/llvm/include/llvm/XRay/BlockPrinter.h
+++ b/llvm/include/llvm/XRay/BlockPrinter.h
@@ -18,8 +18,7 @@
#include "llvm/XRay/FDRRecords.h"
#include "llvm/XRay/RecordPrinter.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class LLVM_ABI BlockPrinter : public RecordVisitor {
enum class State {
@@ -55,7 +54,6 @@ public:
void reset() { CurrentState = State::Start; }
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_BLOCKPRINTER_H
diff --git a/llvm/include/llvm/XRay/BlockVerifier.h b/llvm/include/llvm/XRay/BlockVerifier.h
index b88785c..5e7b25c 100644
--- a/llvm/include/llvm/XRay/BlockVerifier.h
+++ b/llvm/include/llvm/XRay/BlockVerifier.h
@@ -16,8 +16,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class LLVM_ABI BlockVerifier : public RecordVisitor {
public:
@@ -64,7 +63,6 @@ public:
void reset();
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_BLOCKVERIFIER_H
diff --git a/llvm/include/llvm/XRay/FDRLogBuilder.h b/llvm/include/llvm/XRay/FDRLogBuilder.h
index f07c446..5f7b815 100644
--- a/llvm/include/llvm/XRay/FDRLogBuilder.h
+++ b/llvm/include/llvm/XRay/FDRLogBuilder.h
@@ -10,8 +10,7 @@
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// The LogBuilder class allows for creating ad-hoc collections of records
/// through the `add<...>(...)` function. An example use of this API is in
@@ -34,7 +33,6 @@ public:
std::vector<std::unique_ptr<Record>> consume() { return std::move(Records); }
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRLOGBUILDER_H
diff --git a/llvm/include/llvm/XRay/FDRRecordConsumer.h b/llvm/include/llvm/XRay/FDRRecordConsumer.h
index 473777f..13bb711 100644
--- a/llvm/include/llvm/XRay/FDRRecordConsumer.h
+++ b/llvm/include/llvm/XRay/FDRRecordConsumer.h
@@ -15,8 +15,7 @@
#include <memory>
#include <vector>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class RecordConsumer {
public:
@@ -48,7 +47,6 @@ public:
Error consume(std::unique_ptr<Record> R) override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRRECORDCONSUMER_H
diff --git a/llvm/include/llvm/XRay/FDRRecordProducer.h b/llvm/include/llvm/XRay/FDRRecordProducer.h
index 083b571..b953f62 100644
--- a/llvm/include/llvm/XRay/FDRRecordProducer.h
+++ b/llvm/include/llvm/XRay/FDRRecordProducer.h
@@ -14,8 +14,7 @@
#include "llvm/XRay/XRayRecord.h"
#include <memory>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class RecordProducer {
public:
@@ -45,7 +44,6 @@ public:
Expected<std::unique_ptr<Record>> produce() override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRRECORDPRODUCER_H
diff --git a/llvm/include/llvm/XRay/FDRRecords.h b/llvm/include/llvm/XRay/FDRRecords.h
index 7ee8db6..91689cae 100644
--- a/llvm/include/llvm/XRay/FDRRecords.h
+++ b/llvm/include/llvm/XRay/FDRRecords.h
@@ -23,8 +23,7 @@
#include "llvm/Support/Error.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class RecordVisitor;
class RecordInitializer;
@@ -444,7 +443,6 @@ public:
Error visit(TypedEventRecord &) override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRRECORDS_H
diff --git a/llvm/include/llvm/XRay/FDRTraceExpander.h b/llvm/include/llvm/XRay/FDRTraceExpander.h
index 197c123..ca400c9 100644
--- a/llvm/include/llvm/XRay/FDRTraceExpander.h
+++ b/llvm/include/llvm/XRay/FDRTraceExpander.h
@@ -17,8 +17,7 @@
#include "llvm/XRay/FDRRecords.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class TraceExpander : public RecordVisitor {
// Type-erased callback for handling individual XRayRecord instances.
@@ -56,7 +55,6 @@ public:
Error flush();
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRTRACEEXPANDER_H
diff --git a/llvm/include/llvm/XRay/FDRTraceWriter.h b/llvm/include/llvm/XRay/FDRTraceWriter.h
index a3dc58e..957039d 100644
--- a/llvm/include/llvm/XRay/FDRTraceWriter.h
+++ b/llvm/include/llvm/XRay/FDRTraceWriter.h
@@ -18,8 +18,7 @@
#include "llvm/XRay/FDRRecords.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// The FDRTraceWriter allows us to hand-craft an XRay Flight Data Recorder
/// (FDR) mode log file. This is used primarily for testing, generating
@@ -50,7 +49,6 @@ private:
support::endian::Writer OS;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FDRTRACEWRITER_H
diff --git a/llvm/include/llvm/XRay/FileHeaderReader.h b/llvm/include/llvm/XRay/FileHeaderReader.h
index ecdb975..758ca29 100644
--- a/llvm/include/llvm/XRay/FileHeaderReader.h
+++ b/llvm/include/llvm/XRay/FileHeaderReader.h
@@ -19,15 +19,13 @@
#include "llvm/XRay/XRayRecord.h"
#include <cstdint>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// Convenience function for loading the file header given a data extractor at a
/// specified offset.
LLVM_ABI Expected<XRayFileHeader>
readBinaryFormatHeader(DataExtractor &HeaderExtractor, uint64_t &OffsetPtr);
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_FILEHEADERREADER_H
diff --git a/llvm/include/llvm/XRay/Graph.h b/llvm/include/llvm/XRay/Graph.h
index 07b418b..8521e09 100644
--- a/llvm/include/llvm/XRay/Graph.h
+++ b/llvm/include/llvm/XRay/Graph.h
@@ -23,8 +23,7 @@
#include "llvm/ADT/iterator.h"
#include "llvm/Support/Error.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// A Graph object represents a Directed Graph and is used in XRay to compute
/// and store function call graphs and associated statistical information.
@@ -485,6 +484,6 @@ public:
return p;
}
};
-}
-}
+} // namespace llvm::xray
+
#endif
diff --git a/llvm/include/llvm/XRay/InstrumentationMap.h b/llvm/include/llvm/XRay/InstrumentationMap.h
index b5371478..c5e7ebf 100644
--- a/llvm/include/llvm/XRay/InstrumentationMap.h
+++ b/llvm/include/llvm/XRay/InstrumentationMap.h
@@ -23,9 +23,7 @@
#include <unordered_map>
#include <vector>
-namespace llvm {
-
-namespace xray {
+namespace llvm::xray {
// Forward declare to make a friend.
class InstrumentationMap;
@@ -102,11 +100,11 @@ public:
const SledContainer &sleds() const { return Sleds; };
};
-} // end namespace xray
-
-namespace yaml {
+} // end namespace llvm::xray
-template <> struct ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
+namespace llvm {
+template <>
+struct yaml::ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
static void enumeration(IO &IO, xray::SledEntry::FunctionKinds &Kind) {
IO.enumCase(Kind, "function-enter", xray::SledEntry::FunctionKinds::ENTRY);
IO.enumCase(Kind, "function-exit", xray::SledEntry::FunctionKinds::EXIT);
@@ -118,7 +116,7 @@ template <> struct ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
}
};
-template <> struct MappingTraits<xray::YAMLXRaySledEntry> {
+template <> struct yaml::MappingTraits<xray::YAMLXRaySledEntry> {
static void mapping(IO &IO, xray::YAMLXRaySledEntry &Entry) {
IO.mapRequired("id", Entry.FuncId);
IO.mapRequired("address", Entry.Address);
@@ -131,10 +129,7 @@ template <> struct MappingTraits<xray::YAMLXRaySledEntry> {
static constexpr bool flow = true;
};
-
-} // end namespace yaml
-
-} // end namespace llvm
+} // namespace llvm
LLVM_YAML_IS_SEQUENCE_VECTOR(xray::YAMLXRaySledEntry)
diff --git a/llvm/include/llvm/XRay/Profile.h b/llvm/include/llvm/XRay/Profile.h
index e30c01e..b5b8dd2 100644
--- a/llvm/include/llvm/XRay/Profile.h
+++ b/llvm/include/llvm/XRay/Profile.h
@@ -22,8 +22,7 @@
#include <utility>
#include <vector>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class Profile;
@@ -144,7 +143,6 @@ public:
bool empty() const { return Blocks.empty(); }
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif
diff --git a/llvm/include/llvm/XRay/RecordPrinter.h b/llvm/include/llvm/XRay/RecordPrinter.h
index 5d2c277..3281221 100644
--- a/llvm/include/llvm/XRay/RecordPrinter.h
+++ b/llvm/include/llvm/XRay/RecordPrinter.h
@@ -17,8 +17,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
class LLVM_ABI RecordPrinter : public RecordVisitor {
raw_ostream &OS;
@@ -44,7 +43,6 @@ public:
Error visit(TypedEventRecord &) override;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_RECORDPRINTER_H
diff --git a/llvm/include/llvm/XRay/Trace.h b/llvm/include/llvm/XRay/Trace.h
index 5e4e40a..13ada22 100644
--- a/llvm/include/llvm/XRay/Trace.h
+++ b/llvm/include/llvm/XRay/Trace.h
@@ -21,8 +21,7 @@
#include "llvm/Support/Error.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// A Trace object represents the records that have been loaded from XRay
/// log files generated by instrumented binaries. We encapsulate the logic of
@@ -76,7 +75,6 @@ LLVM_ABI Expected<Trace> loadTraceFile(StringRef Filename, bool Sort = false);
LLVM_ABI Expected<Trace> loadTrace(const DataExtractor &Extractor,
bool Sort = false);
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_TRACE_H
diff --git a/llvm/include/llvm/XRay/XRayRecord.h b/llvm/include/llvm/XRay/XRayRecord.h
index 238bf3d..8f3440c 100644
--- a/llvm/include/llvm/XRay/XRayRecord.h
+++ b/llvm/include/llvm/XRay/XRayRecord.h
@@ -18,8 +18,7 @@
#include <vector>
#include <string>
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
/// XRay traces all have a header providing some top-matter information useful
/// to help tools determine how to interpret the information available in the
@@ -98,7 +97,6 @@ struct XRayRecord {
std::string Data;
};
-} // namespace xray
-} // namespace llvm
+} // namespace llvm::xray
#endif // LLVM_XRAY_XRAYRECORD_H
diff --git a/llvm/include/llvm/XRay/YAMLXRayRecord.h b/llvm/include/llvm/XRay/YAMLXRayRecord.h
index 6062606..6bf4f1d 100644
--- a/llvm/include/llvm/XRay/YAMLXRayRecord.h
+++ b/llvm/include/llvm/XRay/YAMLXRayRecord.h
@@ -17,8 +17,7 @@
#include "llvm/Support/YAMLTraits.h"
#include "llvm/XRay/XRayRecord.h"
-namespace llvm {
-namespace xray {
+namespace llvm::xray {
struct YAMLXRayFileHeader {
uint16_t Version;
@@ -46,13 +45,12 @@ struct YAMLXRayTrace {
std::vector<YAMLXRayRecord> Records;
};
-} // namespace xray
-
-namespace yaml {
+} // namespace llvm::xray
+namespace llvm {
// YAML Traits
// -----------
-template <> struct ScalarEnumerationTraits<xray::RecordTypes> {
+template <> struct yaml::ScalarEnumerationTraits<xray::RecordTypes> {
static void enumeration(IO &IO, xray::RecordTypes &Type) {
IO.enumCase(Type, "function-enter", xray::RecordTypes::ENTER);
IO.enumCase(Type, "function-exit", xray::RecordTypes::EXIT);
@@ -63,7 +61,7 @@ template <> struct ScalarEnumerationTraits<xray::RecordTypes> {
}
};
-template <> struct MappingTraits<xray::YAMLXRayFileHeader> {
+template <> struct yaml::MappingTraits<xray::YAMLXRayFileHeader> {
static void mapping(IO &IO, xray::YAMLXRayFileHeader &Header) {
IO.mapRequired("version", Header.Version);
IO.mapRequired("type", Header.Type);
@@ -73,7 +71,7 @@ template <> struct MappingTraits<xray::YAMLXRayFileHeader> {
}
};
-template <> struct MappingTraits<xray::YAMLXRayRecord> {
+template <> struct yaml::MappingTraits<xray::YAMLXRayRecord> {
static void mapping(IO &IO, xray::YAMLXRayRecord &Record) {
IO.mapRequired("type", Record.RecordType);
IO.mapOptional("func-id", Record.FuncId);
@@ -90,7 +88,7 @@ template <> struct MappingTraits<xray::YAMLXRayRecord> {
static constexpr bool flow = true;
};
-template <> struct MappingTraits<xray::YAMLXRayTrace> {
+template <> struct yaml::MappingTraits<llvm::xray::YAMLXRayTrace> {
static void mapping(IO &IO, xray::YAMLXRayTrace &Trace) {
// A trace file contains two parts, the header and the list of all the
// trace records.
@@ -98,8 +96,6 @@ template <> struct MappingTraits<xray::YAMLXRayTrace> {
IO.mapRequired("records", Trace.Records);
}
};
-
-} // namespace yaml
} // namespace llvm
LLVM_YAML_IS_SEQUENCE_VECTOR(xray::YAMLXRayRecord)
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index b5b4cd9..3fab6b0 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -1774,7 +1774,7 @@ const SCEV *ScalarEvolution::getZeroExtendExprImpl(const SCEV *Op, Type *Ty,
{
const SCEV *LHS;
const SCEV *RHS;
- if (matchURem(Op, LHS, RHS))
+ if (match(Op, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), *this)))
return getURemExpr(getZeroExtendExpr(LHS, Ty, Depth + 1),
getZeroExtendExpr(RHS, Ty, Depth + 1));
}
@@ -2699,17 +2699,12 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
}
// Canonicalize (-1 * urem X, Y) + X --> (Y * X/Y)
- if (Ops.size() == 2) {
- const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Ops[0]);
- if (Mul && Mul->getNumOperands() == 2 &&
- Mul->getOperand(0)->isAllOnesValue()) {
- const SCEV *X;
- const SCEV *Y;
- if (matchURem(Mul->getOperand(1), X, Y) && X == Ops[1]) {
- return getMulExpr(Y, getUDivExpr(X, Y));
- }
- }
- }
+ const SCEV *Y;
+ if (Ops.size() == 2 &&
+ match(Ops[0],
+ m_scev_Mul(m_scev_AllOnes(),
+ m_scev_URem(m_scev_Specific(Ops[1]), m_SCEV(Y), *this))))
+ return getMulExpr(Y, getUDivExpr(Ops[1], Y));
// Skip past any other cast SCEVs.
while (Idx < Ops.size() && Ops[Idx]->getSCEVType() < scAddExpr)
@@ -5419,20 +5414,15 @@ static Type *isSimpleCastedPHI(const SCEV *Op, const SCEVUnknown *SymbolicPHI,
if (SourceBits != NewBits)
return nullptr;
- const SCEVSignExtendExpr *SExt = dyn_cast<SCEVSignExtendExpr>(Op);
- const SCEVZeroExtendExpr *ZExt = dyn_cast<SCEVZeroExtendExpr>(Op);
- if (!SExt && !ZExt)
- return nullptr;
- const SCEVTruncateExpr *Trunc =
- SExt ? dyn_cast<SCEVTruncateExpr>(SExt->getOperand())
- : dyn_cast<SCEVTruncateExpr>(ZExt->getOperand());
- if (!Trunc)
- return nullptr;
- const SCEV *X = Trunc->getOperand();
- if (X != SymbolicPHI)
- return nullptr;
- Signed = SExt != nullptr;
- return Trunc->getType();
+ if (match(Op, m_scev_SExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+ Signed = true;
+ return cast<SCEVCastExpr>(Op)->getOperand()->getType();
+ }
+ if (match(Op, m_scev_ZExt(m_scev_Trunc(m_scev_Specific(SymbolicPHI))))) {
+ Signed = false;
+ return cast<SCEVCastExpr>(Op)->getOperand()->getType();
+ }
+ return nullptr;
}
static const Loop *isIntegerLoopHeaderPHI(const PHINode *PN, LoopInfo &LI) {
@@ -15415,67 +15405,6 @@ void PredicatedScalarEvolution::print(raw_ostream &OS, unsigned Depth) const {
}
}
-// Match the mathematical pattern A - (A / B) * B, where A and B can be
-// arbitrary expressions. Also match zext (trunc A to iB) to iY, which is used
-// for URem with constant power-of-2 second operands.
-// It's not always easy, as A and B can be folded (imagine A is X / 2, and B is
-// 4, A / B becomes X / 8).
-bool ScalarEvolution::matchURem(const SCEV *Expr, const SCEV *&LHS,
- const SCEV *&RHS) {
- if (Expr->getType()->isPointerTy())
- return false;
-
- // Try to match 'zext (trunc A to iB) to iY', which is used
- // for URem with constant power-of-2 second operands. Make sure the size of
- // the operand A matches the size of the whole expressions.
- if (const auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(Expr))
- if (const auto *Trunc = dyn_cast<SCEVTruncateExpr>(ZExt->getOperand(0))) {
- LHS = Trunc->getOperand();
- // Bail out if the type of the LHS is larger than the type of the
- // expression for now.
- if (getTypeSizeInBits(LHS->getType()) >
- getTypeSizeInBits(Expr->getType()))
- return false;
- if (LHS->getType() != Expr->getType())
- LHS = getZeroExtendExpr(LHS, Expr->getType());
- RHS = getConstant(APInt(getTypeSizeInBits(Expr->getType()), 1)
- << getTypeSizeInBits(Trunc->getType()));
- return true;
- }
- const auto *Add = dyn_cast<SCEVAddExpr>(Expr);
- if (Add == nullptr || Add->getNumOperands() != 2)
- return false;
-
- const SCEV *A = Add->getOperand(1);
- const auto *Mul = dyn_cast<SCEVMulExpr>(Add->getOperand(0));
-
- if (Mul == nullptr)
- return false;
-
- const auto MatchURemWithDivisor = [&](const SCEV *B) {
- // (SomeExpr + (-(SomeExpr / B) * B)).
- if (Expr == getURemExpr(A, B)) {
- LHS = A;
- RHS = B;
- return true;
- }
- return false;
- };
-
- // (SomeExpr + (-1 * (SomeExpr / B) * B)).
- if (Mul->getNumOperands() == 3 && isa<SCEVConstant>(Mul->getOperand(0)))
- return MatchURemWithDivisor(Mul->getOperand(1)) ||
- MatchURemWithDivisor(Mul->getOperand(2));
-
- // (SomeExpr + ((-SomeExpr / B) * B)) or (SomeExpr + ((SomeExpr / B) * -B)).
- if (Mul->getNumOperands() == 2)
- return MatchURemWithDivisor(Mul->getOperand(1)) ||
- MatchURemWithDivisor(Mul->getOperand(0)) ||
- MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(1))) ||
- MatchURemWithDivisor(getNegativeSCEV(Mul->getOperand(0)));
- return false;
-}
-
ScalarEvolution::LoopGuards
ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) {
BasicBlock *Header = L->getHeader();
@@ -15696,20 +15625,18 @@ void ScalarEvolution::LoopGuards::collectFromBlock(
if (Predicate == CmpInst::ICMP_EQ && match(RHS, m_scev_Zero())) {
// If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
// explicitly express that.
- const SCEV *URemLHS = nullptr;
+ const SCEVUnknown *URemLHS = nullptr;
const SCEV *URemRHS = nullptr;
- if (SE.matchURem(LHS, URemLHS, URemRHS)) {
- if (const SCEVUnknown *LHSUnknown = dyn_cast<SCEVUnknown>(URemLHS)) {
- auto I = RewriteMap.find(LHSUnknown);
- const SCEV *RewrittenLHS =
- I != RewriteMap.end() ? I->second : LHSUnknown;
- RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS);
- const auto *Multiple =
- SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS);
- RewriteMap[LHSUnknown] = Multiple;
- ExprsToRewrite.push_back(LHSUnknown);
- return;
- }
+ if (match(LHS,
+ m_scev_URem(m_SCEVUnknown(URemLHS), m_SCEV(URemRHS), SE))) {
+ auto I = RewriteMap.find(URemLHS);
+ const SCEV *RewrittenLHS = I != RewriteMap.end() ? I->second : URemLHS;
+ RewrittenLHS = ApplyDivisibiltyOnMinMaxExpr(RewrittenLHS, URemRHS);
+ const auto *Multiple =
+ SE.getMulExpr(SE.getUDivExpr(RewrittenLHS, URemRHS), URemRHS);
+ RewriteMap[URemLHS] = Multiple;
+ ExprsToRewrite.push_back(URemLHS);
+ return;
}
}
diff --git a/llvm/lib/Analysis/StaticDataProfileInfo.cpp b/llvm/lib/Analysis/StaticDataProfileInfo.cpp
index b036b2d..1f751ee 100644
--- a/llvm/lib/Analysis/StaticDataProfileInfo.cpp
+++ b/llvm/lib/Analysis/StaticDataProfileInfo.cpp
@@ -6,6 +6,46 @@
#include "llvm/ProfileData/InstrProf.h"
using namespace llvm;
+
+namespace llvm {
+namespace memprof {
+// Returns true iff the global variable has a custom section either by
+// __attribute__((section("name")))
+// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate)
+// or #pragma clang section directives
+// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section).
+static bool hasExplicitSectionName(const GlobalVariable &GVar) {
+ if (GVar.hasSection())
+ return true;
+
+ auto Attrs = GVar.getAttributes();
+ if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") ||
+ Attrs.hasAttribute("relro-section") ||
+ Attrs.hasAttribute("rodata-section"))
+ return true;
+ return false;
+}
+
+AnnotationKind getAnnotationKind(const GlobalVariable &GV) {
+ if (GV.isDeclarationForLinker())
+ return AnnotationKind::DeclForLinker;
+ // Skip 'llvm.'-prefixed global variables conservatively because they are
+ // often handled specially.
+ StringRef Name = GV.getName();
+ if (Name.starts_with("llvm."))
+ return AnnotationKind::ReservedName;
+ // Respect user-specified custom data sections.
+ if (hasExplicitSectionName(GV))
+ return AnnotationKind::ExplicitSection;
+ return AnnotationKind::AnnotationOK;
+}
+
+bool IsAnnotationOK(const GlobalVariable &GV) {
+ return getAnnotationKind(GV) == AnnotationKind::AnnotationOK;
+}
+} // namespace memprof
+} // namespace llvm
+
void StaticDataProfileInfo::addConstantProfileCount(
const Constant *C, std::optional<uint64_t> Count) {
if (!Count) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
index 6356d71..873ac8f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
@@ -20,7 +20,7 @@
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
-namespace llvm {
+using namespace llvm;
AIXException::AIXException(AsmPrinter *A) : EHStreamer(A) {}
@@ -90,5 +90,3 @@ void AIXException::endFunction(const MachineFunction *MF) {
emitExceptionInfoTable(LSDALabel, PerSym);
}
-
-} // End of namespace llvm
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index 260ce8f..93ae548 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -85,8 +85,7 @@ template <> struct llvm::DenseMapInfo<VariableID> {
using VarLocInsertPt = PointerUnion<const Instruction *, const DbgRecord *>;
-namespace std {
-template <> struct hash<VarLocInsertPt> {
+template <> struct std::hash<VarLocInsertPt> {
using argument_type = VarLocInsertPt;
using result_type = std::size_t;
@@ -94,7 +93,6 @@ template <> struct hash<VarLocInsertPt> {
return std::hash<void *>()(Arg.getOpaqueValue());
}
};
-} // namespace std
/// Helper class to build FunctionVarLocs, since that class isn't easy to
/// modify. TODO: There's not a great deal of value in the split, it could be
diff --git a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
index fd7df6b..47b7a88 100644
--- a/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
+++ b/llvm/lib/CodeGen/BasicBlockPathCloning.cpp
@@ -207,9 +207,7 @@ bool ApplyCloning(MachineFunction &MF,
}
return AnyPathsCloned;
}
-} // end anonymous namespace
-namespace llvm {
class BasicBlockPathCloning : public MachineFunctionPass {
public:
static char ID;
@@ -229,7 +227,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
};
-} // namespace llvm
+} // namespace
char BasicBlockPathCloning::ID = 0;
INITIALIZE_PASS_BEGIN(
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp
index 28e6728..1846880 100644
--- a/llvm/lib/CodeGen/BreakFalseDeps.cpp
+++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -31,7 +31,7 @@
using namespace llvm;
-namespace llvm {
+namespace {
class BreakFalseDeps : public MachineFunctionPass {
private:
@@ -95,7 +95,7 @@ private:
void processUndefReads(MachineBasicBlock *);
};
-} // namespace llvm
+} // namespace
#define DEBUG_TYPE "break-false-deps"
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 6c2a5a7..87ada87 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -126,8 +126,7 @@ hash_code hash_value(const ComplexValue &Arg) {
} // end namespace
typedef SmallVector<struct ComplexValue, 2> ComplexValues;
-namespace llvm {
-template <> struct DenseMapInfo<ComplexValue> {
+template <> struct llvm::DenseMapInfo<ComplexValue> {
static inline ComplexValue getEmptyKey() {
return {DenseMapInfo<Value *>::getEmptyKey(),
DenseMapInfo<Value *>::getEmptyKey()};
@@ -144,7 +143,6 @@ template <> struct DenseMapInfo<ComplexValue> {
return LHS.Real == RHS.Real && LHS.Imag == RHS.Imag;
}
};
-} // end namespace llvm
namespace {
template <typename T, typename IterT>
diff --git a/llvm/lib/CodeGen/EdgeBundles.cpp b/llvm/lib/CodeGen/EdgeBundles.cpp
index f4335396..50dd66f 100644
--- a/llvm/lib/CodeGen/EdgeBundles.cpp
+++ b/llvm/lib/CodeGen/EdgeBundles.cpp
@@ -81,13 +81,10 @@ void EdgeBundles::init() {
}
}
-namespace llvm {
-
/// Specialize WriteGraph, the standard implementation won't work.
-template<>
-raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
- bool ShortNames,
- const Twine &Title) {
+template <>
+raw_ostream &llvm::WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
+ bool ShortNames, const Twine &Title) {
const MachineFunction *MF = G.getMachineFunction();
O << "digraph {\n";
@@ -107,8 +104,6 @@ raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
return O;
}
-} // end namespace llvm
-
/// view - Visualize the annotated bipartite CFG with Graphviz.
void EdgeBundles::view() const {
ViewGraph(*this, "EdgeBundles");
diff --git a/llvm/lib/CodeGen/ExpandFp.cpp b/llvm/lib/CodeGen/ExpandFp.cpp
index 9cc6c6a..04c7008 100644
--- a/llvm/lib/CodeGen/ExpandFp.cpp
+++ b/llvm/lib/CodeGen/ExpandFp.cpp
@@ -82,7 +82,7 @@ public:
}
static FRemExpander create(IRBuilder<> &B, Type *Ty) {
- assert(canExpandType(Ty));
+ assert(canExpandType(Ty) && "Expected supported floating point type");
// The type to use for the computation of the remainder. This may be
// wider than the input/result type which affects the ...
@@ -356,8 +356,9 @@ Value *FRemExpander::buildFRem(Value *X, Value *Y,
static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
LLVM_DEBUG(dbgs() << "Expanding instruction: " << I << '\n');
- Type *ReturnTy = I.getType();
- assert(FRemExpander::canExpandType(ReturnTy->getScalarType()));
+ Type *Ty = I.getType();
+ assert(FRemExpander::canExpandType(Ty) &&
+ "Expected supported floating point type");
FastMathFlags FMF = I.getFastMathFlags();
// TODO Make use of those flags for optimization?
@@ -368,32 +369,10 @@ static bool expandFRem(BinaryOperator &I, std::optional<SimplifyQuery> &SQ) {
B.setFastMathFlags(FMF);
B.SetCurrentDebugLocation(I.getDebugLoc());
- Type *ElemTy = ReturnTy->getScalarType();
- const FRemExpander Expander = FRemExpander::create(B, ElemTy);
-
- Value *Ret;
- if (ReturnTy->isFloatingPointTy())
- Ret = FMF.approxFunc()
- ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1))
- : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ);
- else {
- auto *VecTy = cast<FixedVectorType>(ReturnTy);
-
- // This could use SplitBlockAndInsertForEachLane but the interface
- // is a bit awkward for a constant number of elements and it will
- // boil down to the same code.
- // TODO Expand the FRem instruction only once and reuse the code.
- Value *Nums = I.getOperand(0);
- Value *Denums = I.getOperand(1);
- Ret = PoisonValue::get(I.getType());
- for (int I = 0, E = VecTy->getNumElements(); I != E; ++I) {
- Value *Num = B.CreateExtractElement(Nums, I);
- Value *Denum = B.CreateExtractElement(Denums, I);
- Value *Rem = FMF.approxFunc() ? Expander.buildApproxFRem(Num, Denum)
- : Expander.buildFRem(Num, Denum, SQ);
- Ret = B.CreateInsertElement(Ret, Rem, I);
- }
- }
+ const FRemExpander Expander = FRemExpander::create(B, Ty);
+ Value *Ret = FMF.approxFunc()
+ ? Expander.buildApproxFRem(I.getOperand(0), I.getOperand(1))
+ : Expander.buildFRem(I.getOperand(0), I.getOperand(1), SQ);
I.replaceAllUsesWith(Ret);
Ret->takeName(&I);
@@ -939,7 +918,8 @@ static void expandIToFP(Instruction *IToFP) {
IToFP->eraseFromParent();
}
-static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
+static void scalarize(Instruction *I,
+ SmallVectorImpl<Instruction *> &Worklist) {
VectorType *VTy = cast<FixedVectorType>(I->getType());
IRBuilder<> Builder(I);
@@ -948,12 +928,25 @@ static void scalarize(Instruction *I, SmallVectorImpl<Instruction *> &Replace) {
Value *Result = PoisonValue::get(VTy);
for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
Value *Ext = Builder.CreateExtractElement(I->getOperand(0), Idx);
- Value *Cast = Builder.CreateCast(cast<CastInst>(I)->getOpcode(), Ext,
- I->getType()->getScalarType());
- Result = Builder.CreateInsertElement(Result, Cast, Idx);
- if (isa<Instruction>(Cast))
- Replace.push_back(cast<Instruction>(Cast));
+
+ Value *NewOp = nullptr;
+ if (auto *BinOp = dyn_cast<BinaryOperator>(I))
+ NewOp = Builder.CreateBinOp(
+ BinOp->getOpcode(), Ext,
+ Builder.CreateExtractElement(I->getOperand(1), Idx));
+ else if (auto *CastI = dyn_cast<CastInst>(I))
+ NewOp = Builder.CreateCast(CastI->getOpcode(), Ext,
+ I->getType()->getScalarType());
+ else
+ llvm_unreachable("Unsupported instruction type");
+
+ Result = Builder.CreateInsertElement(Result, NewOp, Idx);
+ if (auto *ScalarizedI = dyn_cast<Instruction>(NewOp)) {
+ ScalarizedI->copyIRFlags(I, true);
+ Worklist.push_back(ScalarizedI);
+ }
}
+
I->replaceAllUsesWith(Result);
I->dropAllReferences();
I->eraseFromParent();
@@ -989,10 +982,17 @@ static bool targetSupportsFrem(const TargetLowering &TLI, Type *Ty) {
return TLI.getLibcallName(fremToLibcall(Ty->getScalarType()));
}
+static void addToWorklist(Instruction &I,
+ SmallVector<Instruction *, 4> &Worklist) {
+ if (I.getOperand(0)->getType()->isVectorTy())
+ scalarize(&I, Worklist);
+ else
+ Worklist.push_back(&I);
+}
+
static bool runImpl(Function &F, const TargetLowering &TLI,
AssumptionCache *AC) {
- SmallVector<Instruction *, 4> Replace;
- SmallVector<Instruction *, 4> ReplaceVector;
+ SmallVector<Instruction *, 4> Worklist;
bool Modified = false;
unsigned MaxLegalFpConvertBitWidth =
@@ -1003,55 +1003,39 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
if (MaxLegalFpConvertBitWidth >= llvm::IntegerType::MAX_INT_BITS)
return false;
- for (auto &I : instructions(F)) {
- switch (I.getOpcode()) {
- case Instruction::FRem: {
- Type *Ty = I.getType();
- // TODO: This pass doesn't handle scalable vectors.
- if (Ty->isScalableTy())
- continue;
-
- if (targetSupportsFrem(TLI, Ty) ||
- !FRemExpander::canExpandType(Ty->getScalarType()))
- continue;
-
- Replace.push_back(&I);
- Modified = true;
+ for (auto It = inst_begin(&F), End = inst_end(F); It != End;) {
+ Instruction &I = *It++;
+ Type *Ty = I.getType();
+ // TODO: This pass doesn't handle scalable vectors.
+ if (Ty->isScalableTy())
+ continue;
+ switch (I.getOpcode()) {
+ case Instruction::FRem:
+ if (!targetSupportsFrem(TLI, Ty) &&
+ FRemExpander::canExpandType(Ty->getScalarType())) {
+ addToWorklist(I, Worklist);
+ Modified = true;
+ }
break;
- }
case Instruction::FPToUI:
case Instruction::FPToSI: {
- // TODO: This pass doesn't handle scalable vectors.
- if (I.getOperand(0)->getType()->isScalableTy())
- continue;
-
- auto *IntTy = cast<IntegerType>(I.getType()->getScalarType());
+ auto *IntTy = cast<IntegerType>(Ty->getScalarType());
if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
continue;
- if (I.getOperand(0)->getType()->isVectorTy())
- ReplaceVector.push_back(&I);
- else
- Replace.push_back(&I);
+ addToWorklist(I, Worklist);
Modified = true;
break;
}
case Instruction::UIToFP:
case Instruction::SIToFP: {
- // TODO: This pass doesn't handle scalable vectors.
- if (I.getOperand(0)->getType()->isScalableTy())
- continue;
-
auto *IntTy =
cast<IntegerType>(I.getOperand(0)->getType()->getScalarType());
if (IntTy->getIntegerBitWidth() <= MaxLegalFpConvertBitWidth)
continue;
- if (I.getOperand(0)->getType()->isVectorTy())
- ReplaceVector.push_back(&I);
- else
- Replace.push_back(&I);
+ addToWorklist(I, Worklist);
Modified = true;
break;
}
@@ -1060,16 +1044,8 @@ static bool runImpl(Function &F, const TargetLowering &TLI,
}
}
- while (!ReplaceVector.empty()) {
- Instruction *I = ReplaceVector.pop_back_val();
- scalarize(I, Replace);
- }
-
- if (Replace.empty())
- return false;
-
- while (!Replace.empty()) {
- Instruction *I = Replace.pop_back_val();
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
if (I->getOpcode() == Instruction::FRem) {
auto SQ = [&]() -> std::optional<SimplifyQuery> {
if (AC) {
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
index 90c60d4..3812823 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelValueTracking.cpp
@@ -1975,6 +1975,44 @@ unsigned GISelValueTracking::computeNumSignBits(Register R,
break;
}
+ case TargetOpcode::G_SUB: {
+ Register Src2 = MI.getOperand(2).getReg();
+ unsigned Src2NumSignBits =
+ computeNumSignBits(Src2, DemandedElts, Depth + 1);
+ if (Src2NumSignBits == 1)
+ return 1; // Early out.
+
+ // Handle NEG.
+ Register Src1 = MI.getOperand(1).getReg();
+ KnownBits Known1 = getKnownBits(Src1, DemandedElts, Depth);
+ if (Known1.isZero()) {
+ KnownBits Known2 = getKnownBits(Src2, DemandedElts, Depth);
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((Known2.Zero | 1).isAllOnes())
+ return TyBits;
+
+ // If the input is known to be positive (the sign bit is known clear),
+ // the output of the NEG has, at worst, the same number of sign bits as
+ // the input.
+ if (Known2.isNonNegative()) {
+ FirstAnswer = Src2NumSignBits;
+ break;
+ }
+
+ // Otherwise, we treat this like a SUB.
+ }
+
+ unsigned Src1NumSignBits =
+ computeNumSignBits(Src1, DemandedElts, Depth + 1);
+ if (Src1NumSignBits == 1)
+ return 1; // Early out.
+
+ // Sub can have at most one borrow bit. Thus we know that the output has,
+ // at worst, one fewer sign bit than the input with the fewest sign bits.
+ FirstAnswer = std::min(Src1NumSignBits, Src2NumSignBits) - 1;
+ break;
+ }
case TargetOpcode::G_FCMP:
case TargetOpcode::G_ICMP: {
bool IsFP = Opcode == TargetOpcode::G_FCMP;
diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
index 47640c4a..81ab317 100644
--- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
+++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
@@ -587,16 +587,12 @@ public:
} // namespace
char GlobalMergeFuncPassWrapper::ID = 0;
-INITIALIZE_PASS_BEGIN(GlobalMergeFuncPassWrapper, "global-merge-func",
- "Global merge function pass", false, false)
-INITIALIZE_PASS_END(GlobalMergeFuncPassWrapper, "global-merge-func",
- "Global merge function pass", false, false)
+INITIALIZE_PASS(GlobalMergeFuncPassWrapper, "global-merge-func",
+ "Global merge function pass", false, false)
-namespace llvm {
-ModulePass *createGlobalMergeFuncPass() {
+ModulePass *llvm::createGlobalMergeFuncPass() {
return new GlobalMergeFuncPassWrapper();
}
-} // namespace llvm
GlobalMergeFuncPassWrapper::GlobalMergeFuncPassWrapper() : ModulePass(ID) {
initializeGlobalMergeFuncPassWrapperPass(
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 3485a27..0e38017 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -101,15 +101,11 @@ static cl::opt<bool> EnablePrecomputePhysRegs(
static bool EnablePrecomputePhysRegs = false;
#endif // NDEBUG
-namespace llvm {
-
-cl::opt<bool> UseSegmentSetForPhysRegs(
+cl::opt<bool> llvm::UseSegmentSetForPhysRegs(
"use-segment-set-for-physregs", cl::Hidden, cl::init(true),
cl::desc(
"Use segment set for the computation of the live ranges of physregs."));
-} // end namespace llvm
-
void LiveIntervalsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addPreserved<LiveVariablesWrapperPass>();
diff --git a/llvm/lib/CodeGen/MIR2Vec.cpp b/llvm/lib/CodeGen/MIR2Vec.cpp
index e859765..5c78d98 100644
--- a/llvm/lib/CodeGen/MIR2Vec.cpp
+++ b/llvm/lib/CodeGen/MIR2Vec.cpp
@@ -29,20 +29,17 @@ using namespace mir2vec;
STATISTIC(MIRVocabMissCounter,
"Number of lookups to MIR entities not present in the vocabulary");
-namespace llvm {
-namespace mir2vec {
-cl::OptionCategory MIR2VecCategory("MIR2Vec Options");
+cl::OptionCategory llvm::mir2vec::MIR2VecCategory("MIR2Vec Options");
// FIXME: Use a default vocab when not specified
static cl::opt<std::string>
VocabFile("mir2vec-vocab-path", cl::Optional,
cl::desc("Path to the vocabulary file for MIR2Vec"), cl::init(""),
cl::cat(MIR2VecCategory));
-cl::opt<float> OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
- cl::desc("Weight for machine opcode embeddings"),
- cl::cat(MIR2VecCategory));
-} // namespace mir2vec
-} // namespace llvm
+cl::opt<float>
+ llvm::mir2vec::OpcWeight("mir2vec-opc-weight", cl::Optional, cl::init(1.0),
+ cl::desc("Weight for machine opcode embeddings"),
+ cl::cat(MIR2VecCategory));
//===----------------------------------------------------------------------===//
// Vocabulary Implementation
diff --git a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
index f5146f5..d988a2a 100644
--- a/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
+++ b/llvm/lib/CodeGen/MIRFSDiscriminator.cpp
@@ -40,7 +40,7 @@ cl::opt<bool> ImprovedFSDiscriminator(
"improved-fs-discriminator", cl::Hidden, cl::init(false),
cl::desc("New FS discriminators encoding (incompatible with the original "
"encoding)"));
-}
+} // namespace llvm
char MIRAddFSDiscriminators::ID = 0;
diff --git a/llvm/lib/CodeGen/MIRNamerPass.cpp b/llvm/lib/CodeGen/MIRNamerPass.cpp
index bc65700..cbf8867 100644
--- a/llvm/lib/CodeGen/MIRNamerPass.cpp
+++ b/llvm/lib/CodeGen/MIRNamerPass.cpp
@@ -23,10 +23,6 @@
using namespace llvm;
-namespace llvm {
-extern char &MIRNamerID;
-} // namespace llvm
-
#define DEBUG_TYPE "mir-namer"
namespace {
@@ -53,10 +49,9 @@ public:
VRegRenamer Renamer(MF.getRegInfo());
- unsigned BBIndex = 0;
ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
- for (auto &MBB : RPOT)
- Changed |= Renamer.renameVRegs(MBB, BBIndex++);
+ for (const auto &[BBIndex, MBB] : enumerate(RPOT))
+ Changed |= Renamer.renameVRegs(MBB, BBIndex);
return Changed;
}
@@ -66,10 +61,4 @@ public:
char MIRNamer::ID;
-char &llvm::MIRNamerID = MIRNamer::ID;
-
-INITIALIZE_PASS_BEGIN(MIRNamer, "mir-namer", "Rename Register Operands", false,
- false)
-
-INITIALIZE_PASS_END(MIRNamer, "mir-namer", "Rename Register Operands", false,
- false)
+INITIALIZE_PASS(MIRNamer, "mir-namer", "Rename Register Operands", false, false)
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index bf8a6cd..1d54d72 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -107,10 +107,8 @@ struct MFPrintState {
} // end anonymous namespace
-namespace llvm::yaml {
-
/// This struct serializes the LLVM IR module.
-template <> struct BlockScalarTraits<Module> {
+template <> struct yaml::BlockScalarTraits<Module> {
static void output(const Module &Mod, void *Ctxt, raw_ostream &OS) {
Mod.print(OS, nullptr);
}
@@ -121,8 +119,6 @@ template <> struct BlockScalarTraits<Module> {
}
};
-} // end namespace llvm::yaml
-
static void printRegMIR(Register Reg, yaml::StringValue &Dest,
const TargetRegisterInfo *TRI) {
raw_string_ostream OS(Dest.Value);
@@ -866,48 +862,46 @@ static void printMI(raw_ostream &OS, MFPrintState &State,
OS << TII->getName(MI.getOpcode());
- LS = ListSeparator();
+ // Print a space after the opcode if any additional tokens are printed.
+ LS = ListSeparator(", ", " ");
- if (I < E) {
- OS << ' ';
- for (; I < E; ++I) {
- OS << LS;
- printMIOperand(OS, State, MI, I, TRI, TII, ShouldPrintRegisterTies,
- PrintedTypes, MRI, /*PrintDef=*/true);
- }
+ for (; I < E; ++I) {
+ OS << LS;
+ printMIOperand(OS, State, MI, I, TRI, TII, ShouldPrintRegisterTies,
+ PrintedTypes, MRI, /*PrintDef=*/true);
}
// Print any optional symbols attached to this instruction as-if they were
// operands.
if (MCSymbol *PreInstrSymbol = MI.getPreInstrSymbol()) {
- OS << LS << " pre-instr-symbol ";
+ OS << LS << "pre-instr-symbol ";
MachineOperand::printSymbol(OS, *PreInstrSymbol);
}
if (MCSymbol *PostInstrSymbol = MI.getPostInstrSymbol()) {
- OS << LS << " post-instr-symbol ";
+ OS << LS << "post-instr-symbol ";
MachineOperand::printSymbol(OS, *PostInstrSymbol);
}
if (MDNode *HeapAllocMarker = MI.getHeapAllocMarker()) {
- OS << LS << " heap-alloc-marker ";
+ OS << LS << "heap-alloc-marker ";
HeapAllocMarker->printAsOperand(OS, State.MST);
}
if (MDNode *PCSections = MI.getPCSections()) {
- OS << LS << " pcsections ";
+ OS << LS << "pcsections ";
PCSections->printAsOperand(OS, State.MST);
}
if (MDNode *MMRA = MI.getMMRAMetadata()) {
- OS << LS << " mmra ";
+ OS << LS << "mmra ";
MMRA->printAsOperand(OS, State.MST);
}
if (uint32_t CFIType = MI.getCFIType())
- OS << LS << " cfi-type " << CFIType;
+ OS << LS << "cfi-type " << CFIType;
if (auto Num = MI.peekDebugInstrNum())
- OS << LS << " debug-instr-number " << Num;
+ OS << LS << "debug-instr-number " << Num;
if (PrintLocations) {
if (const DebugLoc &DL = MI.getDebugLoc()) {
- OS << LS << " debug-location ";
+ OS << LS << "debug-location ";
DL->printAsOperand(OS, State.MST);
}
}
diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
index b2731b69..a72c2c4 100644
--- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
+++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp
@@ -97,7 +97,9 @@ static const bool EnableDevelopmentFeatures = false;
/// this happens only in development mode. It's a no-op otherwise.
namespace llvm {
extern cl::opt<unsigned> EvictInterferenceCutoff;
+} // namespace llvm
+namespace {
class RegAllocScoring : public MachineFunctionPass {
public:
static char ID;
@@ -124,11 +126,12 @@ public:
/// Performs this pass
bool runOnMachineFunction(MachineFunction &) override;
};
+} // namespace
char RegAllocScoring::ID = 0;
-FunctionPass *createRegAllocScoringPass() { return new RegAllocScoring(); }
-
-} // namespace llvm
+FunctionPass *llvm::createRegAllocScoringPass() {
+ return new RegAllocScoring();
+}
INITIALIZE_PASS(RegAllocScoring, "regallocscoringpass",
"Register Allocation Scoring Pass", false, false)
diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index e7fa082..26eb10f 100644
--- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -29,7 +29,6 @@ using namespace llvm;
#define DEBUG_TYPE "machine-block-freq"
-namespace llvm {
static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
"view-machine-block-freq-propagation-dags", cl::Hidden,
cl::desc("Pop up a window to show a dag displaying how machine block "
@@ -44,6 +43,7 @@ static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
clEnumValN(GVDT_Count, "count", "display a graph using the real "
"profile count if available.")));
+namespace llvm {
// Similar option above, but used to control BFI display only after MBP pass
cl::opt<GVDAGType> ViewBlockLayoutWithBFI(
"view-block-layout-with-bfi", cl::Hidden,
@@ -69,15 +69,15 @@ extern cl::opt<std::string> ViewBlockFreqFuncName;
// Defined in Analysis/BlockFrequencyInfo.cpp: -view-hot-freq-perc=
extern cl::opt<unsigned> ViewHotFreqPercent;
-static cl::opt<bool> PrintMachineBlockFreq(
- "print-machine-bfi", cl::init(false), cl::Hidden,
- cl::desc("Print the machine block frequency info."));
-
// Command line option to specify the name of the function for block frequency
// dump. Defined in Analysis/BlockFrequencyInfo.cpp.
extern cl::opt<std::string> PrintBFIFuncName;
} // namespace llvm
+static cl::opt<bool>
+ PrintMachineBlockFreq("print-machine-bfi", cl::init(false), cl::Hidden,
+ cl::desc("Print the machine block frequency info."));
+
static GVDAGType getGVDT() {
if (ViewBlockLayoutWithBFI != GVDT_None)
return ViewBlockLayoutWithBFI;
@@ -85,9 +85,7 @@ static GVDAGType getGVDT() {
return ViewMachineBlockFreqPropagationDAG;
}
-namespace llvm {
-
-template <> struct GraphTraits<MachineBlockFrequencyInfo *> {
+template <> struct llvm::GraphTraits<MachineBlockFrequencyInfo *> {
using NodeRef = const MachineBasicBlock *;
using ChildIteratorType = MachineBasicBlock::const_succ_iterator;
using nodes_iterator = pointer_iterator<MachineFunction::const_iterator>;
@@ -116,7 +114,7 @@ using MBFIDOTGraphTraitsBase =
MachineBranchProbabilityInfo>;
template <>
-struct DOTGraphTraits<MachineBlockFrequencyInfo *>
+struct llvm::DOTGraphTraits<MachineBlockFrequencyInfo *>
: public MBFIDOTGraphTraitsBase {
const MachineFunction *CurFunc = nullptr;
DenseMap<const MachineBasicBlock *, int> LayoutOrderMap;
@@ -159,8 +157,6 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *>
}
};
-} // end namespace llvm
-
AnalysisKey MachineBlockFrequencyAnalysis::Key;
MachineBlockFrequencyAnalysis::Result
diff --git a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index 2e92dd8..7ca4582 100644
--- a/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -18,13 +18,8 @@
using namespace llvm;
-INITIALIZE_PASS_BEGIN(MachineBranchProbabilityInfoWrapperPass,
- "machine-branch-prob",
- "Machine Branch Probability Analysis", false, true)
-INITIALIZE_PASS_END(MachineBranchProbabilityInfoWrapperPass,
- "machine-branch-prob",
- "Machine Branch Probability Analysis", false, true)
-
+INITIALIZE_PASS(MachineBranchProbabilityInfoWrapperPass, "machine-branch-prob",
+ "Machine Branch Probability Analysis", false, true)
namespace llvm {
cl::opt<unsigned>
StaticLikelyProb("static-likely-prob",
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 224231c..bfa5ab2 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -719,43 +719,41 @@ MachineFunction::CallSiteInfo::CallSiteInfo(const CallBase &CB) {
}
}
-namespace llvm {
+template <>
+struct llvm::DOTGraphTraits<const MachineFunction *>
+ : public DefaultDOTGraphTraits {
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
- template<>
- struct DOTGraphTraits<const MachineFunction*> : public DefaultDOTGraphTraits {
- DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+ static std::string getGraphName(const MachineFunction *F) {
+ return ("CFG for '" + F->getName() + "' function").str();
+ }
- static std::string getGraphName(const MachineFunction *F) {
- return ("CFG for '" + F->getName() + "' function").str();
+ std::string getNodeLabel(const MachineBasicBlock *Node,
+ const MachineFunction *Graph) {
+ std::string OutStr;
+ {
+ raw_string_ostream OSS(OutStr);
+
+ if (isSimple()) {
+ OSS << printMBBReference(*Node);
+ if (const BasicBlock *BB = Node->getBasicBlock())
+ OSS << ": " << BB->getName();
+ } else
+ Node->print(OSS);
}
- std::string getNodeLabel(const MachineBasicBlock *Node,
- const MachineFunction *Graph) {
- std::string OutStr;
- {
- raw_string_ostream OSS(OutStr);
-
- if (isSimple()) {
- OSS << printMBBReference(*Node);
- if (const BasicBlock *BB = Node->getBasicBlock())
- OSS << ": " << BB->getName();
- } else
- Node->print(OSS);
- }
-
- if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
-
- // Process string output to make it nicer...
- for (unsigned i = 0; i != OutStr.length(); ++i)
- if (OutStr[i] == '\n') { // Left justify
- OutStr[i] = '\\';
- OutStr.insert(OutStr.begin()+i+1, 'l');
- }
- return OutStr;
- }
- };
+ if (OutStr[0] == '\n')
+ OutStr.erase(OutStr.begin());
-} // end namespace llvm
+ // Process string output to make it nicer...
+ for (unsigned i = 0; i != OutStr.length(); ++i)
+ if (OutStr[i] == '\n') { // Left justify
+ OutStr[i] = '\\';
+ OutStr.insert(OutStr.begin() + i + 1, 'l');
+ }
+ return OutStr;
+ }
+};
void MachineFunction::viewCFG() const
{
diff --git a/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
index 0f88a7b..5111322 100644
--- a/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -60,13 +60,11 @@ char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
INITIALIZE_PASS(MachineFunctionPrinterPass, "machineinstr-printer",
"Machine Function Printer", false, false)
-namespace llvm {
/// Returns a newly-created MachineFunction Printer pass. The
/// default banner is empty.
///
-MachineFunctionPass *createMachineFunctionPrinterPass(raw_ostream &OS,
- const std::string &Banner){
+MachineFunctionPass *
+llvm::createMachineFunctionPrinterPass(raw_ostream &OS,
+ const std::string &Banner) {
return new MachineFunctionPrinterPass(OS, Banner);
}
-
-}
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index fdae3b4..9feb974 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -593,15 +593,12 @@ struct MachineOutliner : public ModulePass {
char MachineOutliner::ID = 0;
-namespace llvm {
-ModulePass *createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
+ModulePass *llvm::createMachineOutlinerPass(RunOutliner RunOutlinerMode) {
MachineOutliner *OL = new MachineOutliner();
OL->RunOutlinerMode = RunOutlinerMode;
return OL;
}
-} // namespace llvm
-
INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE, "Machine Function Outliner", false,
false)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 89ed4da..a717d9e 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -201,16 +201,15 @@ static cl::opt<unsigned> SwpMaxNumStores(
cl::desc("Maximum number of stores allwed in the target loop."), cl::Hidden,
cl::init(200));
-namespace llvm {
-
// A command line option to enable the CopyToPhi DAG mutation.
-cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
- cl::init(true),
- cl::desc("Enable CopyToPhi DAG Mutation"));
+cl::opt<bool>
+ llvm::SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
+ cl::init(true),
+ cl::desc("Enable CopyToPhi DAG Mutation"));
/// A command line argument to force pipeliner to use specified issue
/// width.
-cl::opt<int> SwpForceIssueWidth(
+cl::opt<int> llvm::SwpForceIssueWidth(
"pipeliner-force-issue-width",
cl::desc("Force pipeliner to use specified issue width."), cl::Hidden,
cl::init(-1));
@@ -226,8 +225,6 @@ static cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
clEnumValN(WindowSchedulingFlag::WS_Force, "force",
"Use window algorithm instead of SMS algorithm.")));
-} // end namespace llvm
-
unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
char MachinePipeliner::ID = 0;
#ifndef NDEBUG
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 299bcc4..3ed1045 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -176,9 +176,7 @@ STATISTIC(NumNodeOrderPostRA,
STATISTIC(NumFirstValidPostRA,
"Number of scheduling units chosen for FirstValid heuristic post-RA");
-namespace llvm {
-
-cl::opt<MISched::Direction> PreRADirection(
+cl::opt<MISched::Direction> llvm::PreRADirection(
"misched-prera-direction", cl::Hidden,
cl::desc("Pre reg-alloc list scheduling direction"),
cl::init(MISched::Unspecified),
@@ -206,33 +204,31 @@ static cl::opt<bool>
DumpCriticalPathLength("misched-dcpl", cl::Hidden,
cl::desc("Print critical path length to stdout"));
-cl::opt<bool> VerifyScheduling(
+cl::opt<bool> llvm::VerifyScheduling(
"verify-misched", cl::Hidden,
cl::desc("Verify machine instrs before and after machine scheduling"));
#ifndef NDEBUG
-cl::opt<bool> ViewMISchedDAGs(
+cl::opt<bool> llvm::ViewMISchedDAGs(
"view-misched-dags", cl::Hidden,
cl::desc("Pop up a window to show MISched dags after they are processed"));
-cl::opt<bool> PrintDAGs("misched-print-dags", cl::Hidden,
- cl::desc("Print schedule DAGs"));
-cl::opt<bool> MISchedDumpReservedCycles(
+cl::opt<bool> llvm::PrintDAGs("misched-print-dags", cl::Hidden,
+ cl::desc("Print schedule DAGs"));
+static cl::opt<bool> MISchedDumpReservedCycles(
"misched-dump-reserved-cycles", cl::Hidden, cl::init(false),
cl::desc("Dump resource usage at schedule boundary."));
-cl::opt<bool> MischedDetailResourceBooking(
+static cl::opt<bool> MischedDetailResourceBooking(
"misched-detail-resource-booking", cl::Hidden, cl::init(false),
cl::desc("Show details of invoking getNextResoufceCycle."));
#else
-const bool ViewMISchedDAGs = false;
-const bool PrintDAGs = false;
-const bool MischedDetailResourceBooking = false;
+const bool llvm::ViewMISchedDAGs = false;
+const bool llvm::PrintDAGs = false;
+static const bool MischedDetailResourceBooking = false;
#ifdef LLVM_ENABLE_DUMP
-const bool MISchedDumpReservedCycles = false;
+static const bool MISchedDumpReservedCycles = false;
#endif // LLVM_ENABLE_DUMP
#endif // NDEBUG
-} // end namespace llvm
-
#ifndef NDEBUG
/// In some situations a few uninteresting nodes depend on nearly all other
/// nodes in the graph, provide a cutoff to hide them.
@@ -2053,28 +2049,24 @@ public:
} // end anonymous namespace
-namespace llvm {
-
std::unique_ptr<ScheduleDAGMutation>
-createLoadClusterDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI,
- bool ReorderWhileClustering) {
+llvm::createLoadClusterDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ bool ReorderWhileClustering) {
return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(
TII, TRI, ReorderWhileClustering)
: nullptr;
}
std::unique_ptr<ScheduleDAGMutation>
-createStoreClusterDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI,
- bool ReorderWhileClustering) {
+llvm::createStoreClusterDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ bool ReorderWhileClustering) {
return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(
TII, TRI, ReorderWhileClustering)
: nullptr;
}
-} // end namespace llvm
-
// Sorting all the loads/stores first, then for each load/store, checking the
// following load/store one by one, until reach the first non-dependent one and
// call target hook to see if they can cluster.
@@ -2304,16 +2296,12 @@ protected:
} // end anonymous namespace
-namespace llvm {
-
std::unique_ptr<ScheduleDAGMutation>
-createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
- const TargetRegisterInfo *TRI) {
+llvm::createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
return std::make_unique<CopyConstrain>(TII, TRI);
}
-} // end namespace llvm
-
/// constrainLocalCopy handles two possibilities:
/// 1) Local src:
/// I0: = dst
@@ -3445,14 +3433,13 @@ void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
}
#endif
-namespace llvm {
/// Return true if this heuristic determines order.
/// TODO: Consider refactor return type of these functions as integer or enum,
/// as we may need to differentiate whether TryCand is better than Cand.
-bool tryLess(int TryVal, int CandVal,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason) {
+bool llvm::tryLess(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
if (TryVal < CandVal) {
TryCand.Reason = Reason;
return true;
@@ -3465,10 +3452,10 @@ bool tryLess(int TryVal, int CandVal,
return false;
}
-bool tryGreater(int TryVal, int CandVal,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason) {
+bool llvm::tryGreater(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
if (TryVal > CandVal) {
TryCand.Reason = Reason;
return true;
@@ -3481,9 +3468,9 @@ bool tryGreater(int TryVal, int CandVal,
return false;
}
-bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- SchedBoundary &Zone) {
+bool llvm::tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ SchedBoundary &Zone) {
if (Zone.isTop()) {
// Prefer the candidate with the lesser depth, but only if one of them has
// depth greater than the total latency scheduled so far, otherwise either
@@ -3513,7 +3500,6 @@ bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
}
return false;
}
-} // end namespace llvm
static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop,
bool IsPostRA = false) {
@@ -3798,14 +3784,12 @@ void GenericScheduler::registerRoots() {
}
}
-namespace llvm {
-bool tryPressure(const PressureChange &TryP,
- const PressureChange &CandP,
- GenericSchedulerBase::SchedCandidate &TryCand,
- GenericSchedulerBase::SchedCandidate &Cand,
- GenericSchedulerBase::CandReason Reason,
- const TargetRegisterInfo *TRI,
- const MachineFunction &MF) {
+bool llvm::tryPressure(const PressureChange &TryP, const PressureChange &CandP,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF) {
// If one candidate decreases and the other increases, go with it.
// Invalid candidates have UnitInc==0.
if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
@@ -3838,7 +3822,7 @@ bool tryPressure(const PressureChange &TryP,
return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
}
-unsigned getWeakLeft(const SUnit *SU, bool isTop) {
+unsigned llvm::getWeakLeft(const SUnit *SU, bool isTop) {
return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
}
@@ -3849,7 +3833,7 @@ unsigned getWeakLeft(const SUnit *SU, bool isTop) {
/// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
/// with the operation that produces or consumes the physreg. We'll do this when
/// regalloc has support for parallel copies.
-int biasPhysReg(const SUnit *SU, bool isTop) {
+int llvm::biasPhysReg(const SUnit *SU, bool isTop) {
const MachineInstr *MI = SU->getInstr();
if (MI->isCopy()) {
@@ -3884,7 +3868,6 @@ int biasPhysReg(const SUnit *SU, bool isTop) {
return 0;
}
-} // end namespace llvm
void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop,
@@ -4812,13 +4795,13 @@ static MachineSchedRegistry ShufflerRegistry(
//===----------------------------------------------------------------------===//
#ifndef NDEBUG
-namespace llvm {
-template<> struct GraphTraits<
- ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
+template <>
+struct llvm::GraphTraits<ScheduleDAGMI *> : public GraphTraits<ScheduleDAG *> {
+};
-template<>
-struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
+template <>
+struct llvm::DOTGraphTraits<ScheduleDAGMI *> : public DefaultDOTGraphTraits {
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const ScheduleDAG *G) {
@@ -4878,7 +4861,6 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
}
};
-} // end namespace llvm
#endif // NDEBUG
/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index c2d4aa0..9ac3f741 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -485,10 +485,7 @@ struct LoopBounds {
// Specialize po_iterator_storage in order to prune the post-order traversal so
// it is limited to the current loop and doesn't traverse the loop back edges.
-namespace llvm {
-
-template<>
-class po_iterator_storage<LoopBounds, true> {
+template <> class llvm::po_iterator_storage<LoopBounds, true> {
LoopBounds &LB;
public:
@@ -519,8 +516,6 @@ public:
}
};
-} // end namespace llvm
-
/// Compute the trace through MBB.
void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
LLVM_DEBUG(dbgs() << "Computing " << getName() << " trace through "
diff --git a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
index 087ac62..59c587c 100644
--- a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
+++ b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
@@ -9,7 +9,7 @@
#include "llvm/CodeGen/NonRelocatableStringpool.h"
#include "llvm/ADT/STLExtras.h"
-namespace llvm {
+using namespace llvm;
DwarfStringPoolEntryRef NonRelocatableStringpool::getEntry(StringRef S) {
auto I = Strings.try_emplace(S);
@@ -43,5 +43,3 @@ NonRelocatableStringpool::getEntriesForEmission() const {
});
return Result;
}
-
-} // namespace llvm
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index 6f373a5..e9ffa85 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -76,8 +76,6 @@ using namespace llvm::safestack;
#define DEBUG_TYPE "safe-stack"
-namespace llvm {
-
STATISTIC(NumFunctions, "Total number of functions");
STATISTIC(NumUnsafeStackFunctions, "Number of functions with unsafe stack");
STATISTIC(NumUnsafeStackRestorePointsFunctions,
@@ -89,8 +87,6 @@ STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas");
STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments");
STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads");
-} // namespace llvm
-
/// Use __safestack_pointer_address even if the platform has a faster way of
/// access safe stack pointer.
static cl::opt<bool>
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index eae2e8c..3268c26 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -1551,14 +1551,10 @@ LLVM_DUMP_METHOD void ILPValue::dump() const {
dbgs() << *this << '\n';
}
-namespace llvm {
-
LLVM_ATTRIBUTE_UNUSED
-raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
+raw_ostream &llvm::operator<<(raw_ostream &OS, const ILPValue &Val) {
Val.print(OS);
return OS;
}
-} // end namespace llvm
-
#endif
diff --git a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
index e7b1494..c80eade 100644
--- a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -16,57 +16,51 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-namespace llvm {
- template<>
- struct DOTGraphTraits<ScheduleDAG*> : public DefaultDOTGraphTraits {
+template <>
+struct llvm::DOTGraphTraits<ScheduleDAG *> : public DefaultDOTGraphTraits {
- DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
- static std::string getGraphName(const ScheduleDAG *G) {
- return std::string(G->MF.getName());
- }
+ static std::string getGraphName(const ScheduleDAG *G) {
+ return std::string(G->MF.getName());
+ }
- static bool renderGraphFromBottomUp() {
- return true;
- }
+ static bool renderGraphFromBottomUp() { return true; }
- static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
- return (Node->NumPreds > 10 || Node->NumSuccs > 10);
- }
+ static bool isNodeHidden(const SUnit *Node, const ScheduleDAG *G) {
+ return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+ }
- static std::string getNodeIdentifierLabel(const SUnit *Node,
- const ScheduleDAG *Graph) {
- std::string R;
- raw_string_ostream OS(R);
- OS << static_cast<const void *>(Node);
- return R;
- }
+ static std::string getNodeIdentifierLabel(const SUnit *Node,
+ const ScheduleDAG *Graph) {
+ std::string R;
+ raw_string_ostream OS(R);
+ OS << static_cast<const void *>(Node);
+ return R;
+ }
- /// If you want to override the dot attributes printed for a particular
- /// edge, override this method.
- static std::string getEdgeAttributes(const SUnit *Node,
- SUnitIterator EI,
- const ScheduleDAG *Graph) {
- if (EI.isArtificialDep())
- return "color=cyan,style=dashed";
- if (EI.isCtrlDep())
- return "color=blue,style=dashed";
- return "";
- }
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ static std::string getEdgeAttributes(const SUnit *Node, SUnitIterator EI,
+ const ScheduleDAG *Graph) {
+ if (EI.isArtificialDep())
+ return "color=cyan,style=dashed";
+ if (EI.isCtrlDep())
+ return "color=blue,style=dashed";
+ return "";
+ }
+ std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph);
+ static std::string getNodeAttributes(const SUnit *N,
+ const ScheduleDAG *Graph) {
+ return "shape=Mrecord";
+ }
- std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *Graph);
- static std::string getNodeAttributes(const SUnit *N,
- const ScheduleDAG *Graph) {
- return "shape=Mrecord";
- }
-
- static void addCustomGraphFeatures(ScheduleDAG *G,
- GraphWriter<ScheduleDAG*> &GW) {
- return G->addCustomGraphFeatures(GW);
- }
- };
-}
+ static void addCustomGraphFeatures(ScheduleDAG *G,
+ GraphWriter<ScheduleDAG *> &GW) {
+ return G->addCustomGraphFeatures(GW);
+ }
+};
std::string DOTGraphTraits<ScheduleDAG*>::getNodeLabel(const SUnit *SU,
const ScheduleDAG *G) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b1accdd..e153842 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -509,6 +509,7 @@ namespace {
SDValue visitFMUL(SDNode *N);
template <class MatchContextClass> SDValue visitFMA(SDNode *N);
SDValue visitFMAD(SDNode *N);
+ SDValue visitFMULADD(SDNode *N);
SDValue visitFDIV(SDNode *N);
SDValue visitFREM(SDNode *N);
SDValue visitFSQRT(SDNode *N);
@@ -1991,6 +1992,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::FMUL: return visitFMUL(N);
case ISD::FMA: return visitFMA<EmptyMatchContext>(N);
case ISD::FMAD: return visitFMAD(N);
+ case ISD::FMULADD: return visitFMULADD(N);
case ISD::FDIV: return visitFDIV(N);
case ISD::FREM: return visitFREM(N);
case ISD::FSQRT: return visitFSQRT(N);
@@ -18444,6 +18446,21 @@ SDValue DAGCombiner::visitFMAD(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitFMULADD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Constant fold FMULADD.
+ if (SDValue C =
+ DAG.FoldConstantArithmetic(ISD::FMULADD, DL, VT, {N0, N1, N2}))
+ return C;
+
+ return SDValue();
+}
+
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal.
// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 08af74c..c9aeef7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5786,6 +5786,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts,
case ISD::FCOPYSIGN:
case ISD::FMA:
case ISD::FMAD:
+ case ISD::FMULADD:
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
@@ -5904,6 +5905,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, const APInt &DemandedElts,
case ISD::FCOSH:
case ISD::FTANH:
case ISD::FMA:
+ case ISD::FMULADD:
case ISD::FMAD: {
if (SNaN)
return true;
@@ -7231,7 +7233,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
}
// Handle fma/fmad special cases.
- if (Opcode == ISD::FMA || Opcode == ISD::FMAD) {
+ if (Opcode == ISD::FMA || Opcode == ISD::FMAD || Opcode == ISD::FMULADD) {
assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
assert(Ops[0].getValueType() == VT && Ops[1].getValueType() == VT &&
Ops[2].getValueType() == VT && "FMA types must match!");
@@ -7242,7 +7244,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
APFloat V1 = C1->getValueAPF();
const APFloat &V2 = C2->getValueAPF();
const APFloat &V3 = C3->getValueAPF();
- if (Opcode == ISD::FMAD) {
+ if (Opcode == ISD::FMAD || Opcode == ISD::FMULADD) {
V1.multiply(V2, APFloat::rmNearestTiesToEven);
V1.add(V3, APFloat::rmNearestTiesToEven);
} else
@@ -8781,7 +8783,7 @@ static SDValue getMemcpyLoadsAndStores(
if (Value.getNode()) {
Store = DAG.getStore(
Chain, dl, Value,
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
OutChains.push_back(Store);
}
@@ -8797,7 +8799,7 @@ static SDValue getMemcpyLoadsAndStores(
assert(NVT.bitsGE(VT));
bool isDereferenceable =
- SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
+ SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
if (isDereferenceable)
SrcMMOFlags |= MachineMemOperand::MODereferenceable;
@@ -8806,14 +8808,14 @@ static SDValue getMemcpyLoadsAndStores(
Value = DAG.getExtLoad(
ISD::EXTLOAD, dl, NVT, Chain,
- DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl),
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)),
SrcPtrInfo.getWithOffset(SrcOff), VT,
commonAlignment(*SrcAlign, SrcOff), SrcMMOFlags, NewAAInfo);
OutLoadChains.push_back(Value.getValue(1));
Store = DAG.getTruncStore(
Chain, dl, Value,
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags, NewAAInfo);
OutStoreChains.push_back(Store);
}
@@ -8943,14 +8945,14 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Value;
bool isDereferenceable =
- SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
+ SrcPtrInfo.getWithOffset(SrcOff).isDereferenceable(VTSize, C, DL);
MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
if (isDereferenceable)
SrcMMOFlags |= MachineMemOperand::MODereferenceable;
Value = DAG.getLoad(
VT, dl, Chain,
- DAG.getMemBasePlusOffset(Src, TypeSize::getFixed(SrcOff), dl),
+ DAG.getObjectPtrOffset(dl, Src, TypeSize::getFixed(SrcOff)),
SrcPtrInfo.getWithOffset(SrcOff), *SrcAlign, SrcMMOFlags, NewAAInfo);
LoadValues.push_back(Value);
LoadChains.push_back(Value.getValue(1));
@@ -8965,7 +8967,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
Store = DAG.getStore(
Chain, dl, LoadValues[i],
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags, NewAAInfo);
OutChains.push_back(Store);
DstOff += VTSize;
@@ -9097,7 +9099,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
assert(Value.getValueType() == VT && "Value with wrong type.");
SDValue Store = DAG.getStore(
Chain, dl, Value,
- DAG.getMemBasePlusOffset(Dst, TypeSize::getFixed(DstOff), dl),
+ DAG.getObjectPtrOffset(dl, Dst, TypeSize::getFixed(DstOff)),
DstPtrInfo.getWithOffset(DstOff), Alignment,
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone,
NewAAInfo);
@@ -11844,25 +11846,38 @@ SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
/// getNodeIfExists - Get the specified node if it's already available, or
/// else return NULL.
SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
- ArrayRef<SDValue> Ops) {
+ ArrayRef<SDValue> Ops,
+ bool AllowCommute) {  // AllowCommute is not interpreted here, only forwarded.
SDNodeFlags Flags;
if (Inserter)
Flags = Inserter->getFlags();
- return getNodeIfExists(Opcode, VTList, Ops, Flags);
+ return getNodeIfExists(Opcode, VTList, Ops, Flags, AllowCommute);  // Delegate to the overload that takes explicit flags.
}
SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
ArrayRef<SDValue> Ops,
- const SDNodeFlags Flags) {
- if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
+ const SDNodeFlags Flags,
+ bool AllowCommute) {
+ if (VTList.VTs[VTList.NumVTs - 1] == MVT::Glue)  // No lookup is attempted when the last result type is glue.
+ return nullptr;
+
+ auto Lookup = [&](ArrayRef<SDValue> LookupOps) -> SDNode * {  // Looks up a node with this opcode and VT list for one specific operand order.
FoldingSetNodeID ID;
- AddNodeIDNode(ID, Opcode, VTList, Ops);
+ AddNodeIDNode(ID, Opcode, VTList, LookupOps);
void *IP = nullptr;
- if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) {
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP)) {
E->intersectFlagsWith(Flags);
return E;
}
- }
+ return nullptr;  // Nothing found for this operand order.
+ };
+
+ if (SDNode *Existing = Lookup(Ops))  // Try the operands in the order given first.
+ return Existing;
+
+ if (AllowCommute && TLI->isCommutativeBinOp(Opcode))  // On request, retry a commutative binop with its two operands swapped.
+ return Lookup({Ops[1], Ops[0]});
+
return nullptr;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index c21890a..0f2b518 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6996,6 +6996,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)),
getValue(I.getArgOperand(2)), Flags));
+ } else if (TLI.isOperationLegalOrCustom(ISD::FMULADD, VT)) {
+ // TODO: Support splitting the vector.
+ setValue(&I, DAG.getNode(ISD::FMULADD, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)),
+ getValue(I.getArgOperand(2)), Flags));
} else {
// TODO: Intrinsic calls should have fast-math-flags.
SDValue Mul = DAG.getNode(
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index fcfbfe6..39cbfad 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -310,6 +310,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::FMA: return "fma";
case ISD::STRICT_FMA: return "strict_fma";
case ISD::FMAD: return "fmad";
+ case ISD::FMULADD: return "fmuladd";
case ISD::FREM: return "frem";
case ISD::STRICT_FREM: return "strict_frem";
case ISD::FCOPYSIGN: return "fcopysign";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index cc503d3..920dff9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7676,6 +7676,7 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
break;
}
case ISD::FMA:
+ case ISD::FMULADD:
case ISD::FMAD: {
if (!Flags.hasNoSignedZeros())
break;
diff --git a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
index 64e5cd5..95a9c3f 100644
--- a/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
+++ b/llvm/lib/CodeGen/StackFrameLayoutAnalysisPass.cpp
@@ -306,10 +306,7 @@ char &llvm::StackFrameLayoutAnalysisPassID = StackFrameLayoutAnalysisLegacy::ID;
INITIALIZE_PASS(StackFrameLayoutAnalysisLegacy, "stack-frame-layout",
"Stack Frame Layout", false, false)
-namespace llvm {
/// Returns a newly-created StackFrameLayout pass.
-MachineFunctionPass *createStackFrameLayoutAnalysisPass() {
+MachineFunctionPass *llvm::createStackFrameLayoutAnalysisPass() {
return new StackFrameLayoutAnalysisLegacy();
}
-
-} // namespace llvm
diff --git a/llvm/lib/CodeGen/StaticDataAnnotator.cpp b/llvm/lib/CodeGen/StaticDataAnnotator.cpp
index 53a9ab4..eac20120 100644
--- a/llvm/lib/CodeGen/StaticDataAnnotator.cpp
+++ b/llvm/lib/CodeGen/StaticDataAnnotator.cpp
@@ -75,22 +75,11 @@ bool StaticDataAnnotator::runOnModule(Module &M) {
bool Changed = false;
for (auto &GV : M.globals()) {
- if (GV.isDeclarationForLinker())
+ if (!llvm::memprof::IsAnnotationOK(GV))
continue;
- // The implementation below assumes prior passes don't set section prefixes,
- // and specifically do 'assign' rather than 'update'. So report error if a
- // section prefix is already set.
- if (auto maybeSectionPrefix = GV.getSectionPrefix();
- maybeSectionPrefix && !maybeSectionPrefix->empty())
- llvm::report_fatal_error("Global variable " + GV.getName() +
- " already has a section prefix " +
- *maybeSectionPrefix);
-
StringRef SectionPrefix = SDPI->getConstantSectionPrefix(&GV, PSI);
- if (SectionPrefix.empty())
- continue;
-
+ // setSectionPrefix returns true if the section prefix is updated.
Changed |= GV.setSectionPrefix(SectionPrefix);
}
diff --git a/llvm/lib/CodeGen/StaticDataSplitter.cpp b/llvm/lib/CodeGen/StaticDataSplitter.cpp
index e22dc25..1593a40 100644
--- a/llvm/lib/CodeGen/StaticDataSplitter.cpp
+++ b/llvm/lib/CodeGen/StaticDataSplitter.cpp
@@ -130,10 +130,8 @@ StaticDataSplitter::getConstant(const MachineOperand &Op,
if (Op.isGlobal()) {
// Find global variables with local linkage.
const GlobalVariable *GV = getLocalLinkageGlobalVariable(Op.getGlobal());
- // Skip 'llvm.'-prefixed global variables conservatively because they are
- // often handled specially, and skip those not in static data
- // sections.
- if (!GV || GV->getName().starts_with("llvm.") ||
+ // Skip those not eligible for annotation or not in static data sections.
+ if (!GV || !llvm::memprof::IsAnnotationOK(*GV) ||
!inStaticDataSection(*GV, TM))
return nullptr;
return GV;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index c23281a..060b1dd 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -815,7 +815,8 @@ void TargetLoweringBase::initActions() {
ISD::FTAN, ISD::FACOS,
ISD::FASIN, ISD::FATAN,
ISD::FCOSH, ISD::FSINH,
- ISD::FTANH, ISD::FATAN2},
+ ISD::FTANH, ISD::FATAN2,
+ ISD::FMULADD},
VT, Expand);
// Overflow operations default to expand
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index c9e4618..971f822 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -102,10 +102,8 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet,
return true;
}
-namespace llvm {
-
-Printable printReg(Register Reg, const TargetRegisterInfo *TRI,
- unsigned SubIdx, const MachineRegisterInfo *MRI) {
+Printable llvm::printReg(Register Reg, const TargetRegisterInfo *TRI,
+ unsigned SubIdx, const MachineRegisterInfo *MRI) {
return Printable([Reg, TRI, SubIdx, MRI](raw_ostream &OS) {
if (!Reg)
OS << "$noreg";
@@ -135,7 +133,7 @@ Printable printReg(Register Reg, const TargetRegisterInfo *TRI,
});
}
-Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+Printable llvm::printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
return Printable([Unit, TRI](raw_ostream &OS) {
// Generic printout when TRI is missing.
if (!TRI) {
@@ -158,7 +156,7 @@ Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
});
}
-Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+Printable llvm::printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
return Printable([Unit, TRI](raw_ostream &OS) {
if (Register::isVirtualRegister(Unit)) {
OS << '%' << Register(Unit).virtRegIndex();
@@ -168,8 +166,9 @@ Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
});
}
-Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo,
- const TargetRegisterInfo *TRI) {
+Printable llvm::printRegClassOrBank(Register Reg,
+ const MachineRegisterInfo &RegInfo,
+ const TargetRegisterInfo *TRI) {
return Printable([Reg, &RegInfo, TRI](raw_ostream &OS) {
if (RegInfo.getRegClassOrNull(Reg))
OS << StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower();
@@ -183,8 +182,6 @@ Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo,
});
}
-} // end namespace llvm
-
/// getAllocatableClass - Return the maximal subclass of the given register
/// class that is alloctable, or NULL.
const TargetRegisterClass *
diff --git a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
index 33734b8..bb8d2cb 100644
--- a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
@@ -90,7 +90,7 @@ void MapperJITLinkMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G,
auto TotalSize = Seg.ContentSize + Seg.ZeroFillSize;
Seg.Addr = NextSegAddr;
- Seg.WorkingMem = Mapper->prepare(NextSegAddr, TotalSize);
+ Seg.WorkingMem = Mapper->prepare(G, NextSegAddr, TotalSize);
NextSegAddr += alignTo(TotalSize, Mapper->getPageSize());
diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
index ea3b22a..7b327af 100644
--- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
@@ -58,7 +58,8 @@ void InProcessMemoryMapper::reserve(size_t NumBytes,
ExecutorAddrRange(ExecutorAddr::fromPtr(MB.base()), MB.allocatedSize()));
}
-char *InProcessMemoryMapper::prepare(ExecutorAddr Addr, size_t ContentSize) {
+char *InProcessMemoryMapper::prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) {
return Addr.toPtr<char *>();
}
@@ -324,7 +325,8 @@ void SharedMemoryMapper::reserve(size_t NumBytes,
#endif
}
-char *SharedMemoryMapper::prepare(ExecutorAddr Addr, size_t ContentSize) {
+char *SharedMemoryMapper::prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) {
auto R = Reservations.upper_bound(Addr);
assert(R != Reservations.begin() && "Attempt to prepare unreserved range");
R--;
diff --git a/llvm/lib/IR/ConstantFPRange.cpp b/llvm/lib/IR/ConstantFPRange.cpp
index 51d2e21..5b87686 100644
--- a/llvm/lib/IR/ConstantFPRange.cpp
+++ b/llvm/lib/IR/ConstantFPRange.cpp
@@ -8,6 +8,7 @@
#include "llvm/IR/ConstantFPRange.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
@@ -506,3 +507,168 @@ ConstantFPRange ConstantFPRange::sub(const ConstantFPRange &Other) const {
// fsub X, Y = fadd X, (fneg Y)
return add(Other.negate());
}
+
+void ConstantFPRange::flushDenormals(DenormalMode::DenormalModeKind Mode) {  // Widens the bounds to cover the zero(s) that subnormals map to under Mode.
+ if (Mode == DenormalMode::IEEE)  // IEEE mode: the range is left unchanged.
+ return;
+ FPClassTest Class = classify();
+ if (!(Class & fcSubnormal))  // classify() reports no subnormal class, so the range is left unchanged.
+ return;
+
+ auto &Sem = getSemantics();
+ // PreserveSign: PosSubnormal -> PosZero, NegSubnormal -> NegZero
+ // PositiveZero: PosSubnormal -> PosZero, NegSubnormal -> PosZero
+ // Dynamic: PosSubnormal -> PosZero, NegSubnormal -> NegZero/PosZero
+ bool ZeroLowerNegative =  // Sign of the zero that is merged into Lower below.
+ Mode != DenormalMode::PositiveZero && (Class & fcNegSubnormal);
+ bool ZeroUpperNegative =  // Sign of the zero that is merged into Upper below.
+ Mode == DenormalMode::PreserveSign && !(Class & fcPosSubnormal);
+ assert((ZeroLowerNegative || !ZeroUpperNegative) &&
+ "ZeroLower is greater than ZeroUpper.");
+ Lower = minnum(Lower, APFloat::getZero(Sem, ZeroLowerNegative));  // Lower can only move down (or stay).
+ Upper = maxnum(Upper, APFloat::getZero(Sem, ZeroUpperNegative));  // Upper can only move up (or stay).
+}
+
+/// Represent a contiguous range of values sharing the same sign.
+struct SameSignRange {
+ bool HasZero;  // Initialized from Lower.isZero().
+ bool HasNonZero;  // Initialized from !Upper.isZero().
+ bool HasInf;  // Initialized from Upper.isInfinity().
+ // The lower and upper bounds of the range (inclusive).
+ // The sign is dropped and infinities are excluded. Left empty when Lower is infinity.
+ std::optional<std::pair<APFloat, APFloat>> FinitePart;
+
+ explicit SameSignRange(const APFloat &Lower, const APFloat &Upper)  // Both bounds must be non-negative and must not form an empty range (asserted below).
+ : HasZero(Lower.isZero()), HasNonZero(!Upper.isZero()),
+ HasInf(Upper.isInfinity()) {
+ assert(!Lower.isNegative() && !Upper.isNegative() &&
+ "The sign should be dropped.");
+ assert(strictCompare(Lower, Upper) != APFloat::cmpGreaterThan &&
+ "Empty set.");
+ if (!Lower.isInfinity())
+ FinitePart = {Lower,
+ HasInf ? APFloat::getLargest(Lower.getSemantics()) : Upper};  // An infinite upper bound is replaced by the largest finite value.
+ }
+};
+
+/// Split the range into positive and negative components. A part that is not assigned here keeps the value the caller passed in.
+static void splitPosNeg(const APFloat &Lower, const APFloat &Upper,
+ std::optional<SameSignRange> &NegPart,
+ std::optional<SameSignRange> &PosPart) {
+ assert(strictCompare(Lower, Upper) != APFloat::cmpGreaterThan &&
+ "Non-NaN part is empty.");
+ if (Lower.isNegative() == Upper.isNegative()) {  // Both bounds carry the same sign: exactly one part is assigned.
+ if (Lower.isNegative())
+ NegPart = SameSignRange{abs(Upper), abs(Lower)};  // The bounds swap roles once the sign is dropped.
+ else
+ PosPart = SameSignRange{Lower, Upper};
+ return;
+ }
+ auto &Sem = Lower.getSemantics();
+ NegPart = SameSignRange{APFloat::getZero(Sem), abs(Lower)};  // Negative side as magnitudes: zero up to |Lower|.
+ PosPart = SameSignRange{APFloat::getZero(Sem), Upper};  // Positive side: zero up to Upper.
+}
+
+ConstantFPRange ConstantFPRange::mul(const ConstantFPRange &Other) const {  // Builds the product range one sign combination at a time.
+ auto &Sem = getSemantics();
+ bool ResMayBeQNaN = ((MayBeQNaN || MayBeSNaN) && !Other.isEmptySet()) ||  // A possible NaN on one side counts only if the other side is non-empty.
+ ((Other.MayBeQNaN || Other.MayBeSNaN) && !isEmptySet());
+ if (isNaNOnly() || Other.isNaNOnly())  // Handled before any bound is read.
+ return getNaNOnly(Sem, /*MayBeQNaN=*/ResMayBeQNaN,
+ /*MayBeSNaN=*/false);
+ std::optional<SameSignRange> LHSNeg, LHSPos, RHSNeg, RHSPos;
+ splitPosNeg(Lower, Upper, LHSNeg, LHSPos);
+ splitPosNeg(Other.Lower, Other.Upper, RHSNeg, RHSPos);
+ APFloat ResLower = APFloat::getInf(Sem, /*Negative=*/false);  // Bounds start inverted (+inf, -inf); Update only ever widens them.
+ APFloat ResUpper = APFloat::getInf(Sem, /*Negative=*/true);
+ auto Update = [&](std::optional<SameSignRange> &LHS,
+ std::optional<SameSignRange> &RHS, bool Negative) {  // Negative: the caller's statement of the sign of this pair's products.
+ if (!LHS || !RHS)  // One side has no values of this sign.
+ return;
+ // 0 * inf = QNaN
+ ResMayBeQNaN |= LHS->HasZero && RHS->HasInf;
+ ResMayBeQNaN |= RHS->HasZero && LHS->HasInf;
+ // NonZero * inf = inf
+ if ((LHS->HasInf && RHS->HasNonZero) || (RHS->HasInf && LHS->HasNonZero))
+ (Negative ? ResLower : ResUpper) = APFloat::getInf(Sem, Negative);
+ // Finite * Finite
+ if (LHS->FinitePart && RHS->FinitePart) {
+ APFloat NewLower = LHS->FinitePart->first * RHS->FinitePart->first;  // Product of the two lower magnitude bounds.
+ APFloat NewUpper = LHS->FinitePart->second * RHS->FinitePart->second;  // Product of the two upper magnitude bounds.
+ if (Negative) {  // Negating the magnitudes swaps which one is the lower bound.
+ ResLower = minnum(ResLower, -NewUpper);
+ ResUpper = maxnum(ResUpper, -NewLower);
+ } else {
+ ResLower = minnum(ResLower, NewLower);
+ ResUpper = maxnum(ResUpper, NewUpper);
+ }
+ }
+ };
+ Update(LHSNeg, RHSNeg, /*Negative=*/false);
+ Update(LHSNeg, RHSPos, /*Negative=*/true);
+ Update(LHSPos, RHSNeg, /*Negative=*/true);
+ Update(LHSPos, RHSPos, /*Negative=*/false);
+ return ConstantFPRange(ResLower, ResUpper, ResMayBeQNaN, /*MayBeSNaN=*/false);  // The result is never flagged as possibly SNaN.
+}
+
+ConstantFPRange ConstantFPRange::div(const ConstantFPRange &Other) const {  // Builds the quotient range (this / Other) one sign combination at a time.
+ auto &Sem = getSemantics();
+ bool ResMayBeQNaN = ((MayBeQNaN || MayBeSNaN) && !Other.isEmptySet()) ||  // A possible NaN on one side counts only if the other side is non-empty.
+ ((Other.MayBeQNaN || Other.MayBeSNaN) && !isEmptySet());
+ if (isNaNOnly() || Other.isNaNOnly())  // Handled before any bound is read.
+ return getNaNOnly(Sem, /*MayBeQNaN=*/ResMayBeQNaN,
+ /*MayBeSNaN=*/false);
+ std::optional<SameSignRange> LHSNeg, LHSPos, RHSNeg, RHSPos;
+ splitPosNeg(Lower, Upper, LHSNeg, LHSPos);
+ splitPosNeg(Other.Lower, Other.Upper, RHSNeg, RHSPos);
+ APFloat ResLower = APFloat::getInf(Sem, /*Negative=*/false);  // Bounds start inverted (+inf, -inf); Update only ever widens them.
+ APFloat ResUpper = APFloat::getInf(Sem, /*Negative=*/true);
+ auto Update = [&](std::optional<SameSignRange> &LHS,
+ std::optional<SameSignRange> &RHS, bool Negative) {  // Negative: the caller's statement of the sign of this pair's quotients.
+ if (!LHS || !RHS)  // One side has no values of this sign.
+ return;
+ // inf / inf = QNaN, 0 / 0 = QNaN
+ ResMayBeQNaN |= LHS->HasInf && RHS->HasInf;
+ ResMayBeQNaN |= LHS->HasZero && RHS->HasZero;
+ // It is not straightforward to infer HasNonZeroFinite = HasFinite &&
+ // HasNonZero. By definition we have:
+ // HasFinite = HasNonZeroFinite || HasZero
+ // HasNonZero = HasNonZeroFinite || HasInf
+ // Since the range is contiguous, if both HasFinite and HasNonZero are true,
+ // HasNonZeroFinite must be true.
+ bool LHSHasNonZeroFinite = LHS->FinitePart && LHS->HasNonZero;
+ bool RHSHasNonZeroFinite = RHS->FinitePart && RHS->HasNonZero;
+ // inf / Finite = inf, FiniteNonZero / 0 = inf
+ if ((LHS->HasInf && RHS->FinitePart) ||
+ (LHSHasNonZeroFinite && RHS->HasZero))
+ (Negative ? ResLower : ResUpper) = APFloat::getInf(Sem, Negative);
+ // Finite / inf = 0
+ if (LHS->FinitePart && RHS->HasInf) {
+ APFloat Zero = APFloat::getZero(Sem, /*Negative=*/Negative);  // The zero takes the sign of this pair's quotients.
+ ResLower = minnum(ResLower, Zero);
+ ResUpper = maxnum(ResUpper, Zero);
+ }
+ // Finite / FiniteNonZero
+ if (LHS->FinitePart && RHSHasNonZeroFinite) {
+ assert(!RHS->FinitePart->second.isZero() &&
+ "Divisor should be non-zero.");
+ APFloat NewLower = LHS->FinitePart->first / RHS->FinitePart->second;  // Lower numerator bound over upper divisor bound.
+ APFloat NewUpper = LHS->FinitePart->second /
+ (RHS->FinitePart->first.isZero()  // A zero lower divisor bound is replaced by APFloat::getSmallest.
+ ? APFloat::getSmallest(Sem, /*Negative=*/false)
+ : RHS->FinitePart->first);
+ if (Negative) {  // Negating the magnitudes swaps which one is the lower bound.
+ ResLower = minnum(ResLower, -NewUpper);
+ ResUpper = maxnum(ResUpper, -NewLower);
+ } else {
+ ResLower = minnum(ResLower, NewLower);
+ ResUpper = maxnum(ResUpper, NewUpper);
+ }
+ }
+ };
+ Update(LHSNeg, RHSNeg, /*Negative=*/false);
+ Update(LHSNeg, RHSPos, /*Negative=*/true);
+ Update(LHSPos, RHSNeg, /*Negative=*/true);
+ Update(LHSPos, RHSPos, /*Negative=*/false);
+ return ConstantFPRange(ResLower, ResUpper, ResMayBeQNaN, /*MayBeSNaN=*/false);  // The result is never flagged as possibly SNaN.
+}
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 2c2950c..cbce8bd 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -667,8 +667,11 @@ Constant::PossibleRelocationsTy Constant::getRelocationInfo() const {
if (CE->getOpcode() == Instruction::Sub) {
ConstantExpr *LHS = dyn_cast<ConstantExpr>(CE->getOperand(0));
ConstantExpr *RHS = dyn_cast<ConstantExpr>(CE->getOperand(1));
- if (LHS && RHS && LHS->getOpcode() == Instruction::PtrToInt &&
- RHS->getOpcode() == Instruction::PtrToInt) {
+ if (LHS && RHS &&
+ (LHS->getOpcode() == Instruction::PtrToInt ||
+ LHS->getOpcode() == Instruction::PtrToAddr) &&
+ (RHS->getOpcode() == Instruction::PtrToInt ||
+ RHS->getOpcode() == Instruction::PtrToAddr)) {
Constant *LHSOp0 = LHS->getOperand(0);
Constant *RHSOp0 = RHS->getOperand(0);
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 3f1cc1e..27d8294 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -4098,15 +4098,8 @@ LLVMValueRef LLVMBuildGlobalStringPtr(LLVMBuilderRef B, const char *Str,
return wrap(unwrap(B)->CreateGlobalString(Str, Name));
}
-LLVMBool LLVMGetVolatile(LLVMValueRef MemAccessInst) {
- Value *P = unwrap(MemAccessInst);
- if (LoadInst *LI = dyn_cast<LoadInst>(P))
- return LI->isVolatile();
- if (StoreInst *SI = dyn_cast<StoreInst>(P))
- return SI->isVolatile();
- if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(P))
- return AI->isVolatile();
- return cast<AtomicCmpXchgInst>(P)->isVolatile();
+LLVMBool LLVMGetVolatile(LLVMValueRef Inst) {
+ return cast<Instruction>(unwrap(Inst))->isVolatile();
}
void LLVMSetVolatile(LLVMValueRef MemAccessInst, LLVMBool isVolatile) {
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 614c3a9..15c0198 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
+#include "llvm/IR/ProfDataUtils.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
@@ -1002,6 +1003,18 @@ CallInst *IRBuilderBase::CreateConstrainedFPCall(
return C;
}
+Value *IRBuilderBase::CreateSelectWithUnknownProfile(Value *C, Value *True,
+ Value *False,
+ StringRef PassName,
+ const Twine &Name) {
+ Value *Ret = CreateSelectFMF(C, True, False, {}, Name);  // Same call CreateSelect makes, minus the MDFrom argument.
+ if (auto *SI = dyn_cast<SelectInst>(Ret)) {  // Annotate only when the result really is a SelectInst.
+ setExplicitlyUnknownBranchWeightsIfProfiled(
+ *SI, *SI->getParent()->getParent(), PassName);  // Passes the select, its parent's parent, and the requesting pass name.
+ }
+ return Ret;  // Returned as created, whether or not it was annotated.
+}
+
Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False,
const Twine &Name, Instruction *MDFrom) {
return CreateSelectFMF(C, True, False, {}, Name, MDFrom);
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 88e7c44..9060a89 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -2965,8 +2965,7 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp,
// zext, sext -> zext, because sext can't sign extend after zext
return Instruction::ZExt;
case 11: {
- // inttoptr, ptrtoint/ptrtoaddr -> bitcast if SrcSize<=PtrSize/AddrSize
- // and SrcSize==DstSize
+ // inttoptr, ptrtoint/ptrtoaddr -> integer cast
if (!DL)
return 0;
unsigned MidSize = secondOp == Instruction::PtrToAddr
@@ -2974,10 +2973,15 @@ unsigned CastInst::isEliminableCastPair(Instruction::CastOps firstOp,
: DL->getPointerTypeSizeInBits(MidTy);
unsigned SrcSize = SrcTy->getScalarSizeInBits();
unsigned DstSize = DstTy->getScalarSizeInBits();
- // TODO: Could also produce zext or trunc here.
- if (SrcSize <= MidSize && SrcSize == DstSize)
- return Instruction::BitCast;
- return 0;
+ // If the middle size is smaller than both source and destination,
+ // an additional masking operation would be required.
+ if (MidSize < SrcSize && MidSize < DstSize)
+ return 0;
+ if (DstSize < SrcSize)
+ return Instruction::Trunc;
+ if (DstSize > SrcSize)
+ return Instruction::ZExt;
+ return Instruction::BitCast;
}
case 12:
// addrspacecast, addrspacecast -> bitcast, if SrcAS == DstAS
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index c9ff86b..c79a950 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -893,7 +893,7 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
if (GV.hasInitializer()) {
const Constant *Init = GV.getInitializer();
const ConstantArray *InitArray = dyn_cast<ConstantArray>(Init);
- Check(InitArray, "wrong initalizer for intrinsic global variable",
+ Check(InitArray, "wrong initializer for intrinsic global variable",
Init);
for (Value *Op : InitArray->operands()) {
Value *V = Op->stripPointerCasts();
diff --git a/llvm/lib/Support/DebugCounter.cpp b/llvm/lib/Support/DebugCounter.cpp
index 6b65720..5ab1def 100644
--- a/llvm/lib/Support/DebugCounter.cpp
+++ b/llvm/lib/Support/DebugCounter.cpp
@@ -136,6 +136,13 @@ struct DebugCounterOwner : DebugCounter {
cl::location(this->ShouldPrintCounter),
cl::init(false),
cl::desc("Print out debug counter info after all counters accumulated")};
+ cl::opt<bool, true> PrintDebugCounterQueries{
+ "print-debug-counter-queries",
+ cl::Hidden,
+ cl::Optional,
+ cl::location(this->ShouldPrintCounterQueries),
+ cl::init(false),
+ cl::desc("Print out each query of an enabled debug counter")};
cl::opt<bool, true> BreakOnLastCount{
"debug-counter-break-on-last",
cl::Hidden,
@@ -221,31 +228,40 @@ void DebugCounter::print(raw_ostream &OS) const {
}
}
+bool DebugCounter::handleCounterIncrement(CounterInfo &Info) {  // Bumps Info.Count and returns whether the pre-increment count falls in a chunk.
+ int64_t CurrCount = Info.Count++;  // Count value before the increment.
+ uint64_t CurrIdx = Info.CurrChunkIdx;
+
+ if (Info.Chunks.empty())  // No chunks configured: always true.
+ return true;
+ if (CurrIdx >= Info.Chunks.size())  // All chunks already consumed: always false.
+ return false;
+
+ bool Res = Info.Chunks[CurrIdx].contains(CurrCount);
+ if (BreakOnLast && CurrIdx == (Info.Chunks.size() - 1) &&
+ CurrCount == Info.Chunks[CurrIdx].End) {  // Trap on the End count of the last chunk when BreakOnLast is set.
+ LLVM_BUILTIN_DEBUGTRAP;
+ }
+ if (CurrCount > Info.Chunks[CurrIdx].End) {  // Past the current chunk's End: advance to the next chunk.
+ Info.CurrChunkIdx++;
+
+ // Handle consecutive chunks: the next chunk may begin at this very count.
+ if (Info.CurrChunkIdx < Info.Chunks.size() &&
+ CurrCount == Info.Chunks[Info.CurrChunkIdx].Begin)
+ return true;
+ }
+ return Res;
+}
+
bool DebugCounter::shouldExecuteImpl(unsigned CounterName) {
auto &Us = instance();
auto Result = Us.Counters.find(CounterName);
if (Result != Us.Counters.end()) {
auto &CounterInfo = Result->second;
- int64_t CurrCount = CounterInfo.Count++;
- uint64_t CurrIdx = CounterInfo.CurrChunkIdx;
-
- if (CounterInfo.Chunks.empty())
- return true;
- if (CurrIdx >= CounterInfo.Chunks.size())
- return false;
-
- bool Res = CounterInfo.Chunks[CurrIdx].contains(CurrCount);
- if (Us.BreakOnLast && CurrIdx == (CounterInfo.Chunks.size() - 1) &&
- CurrCount == CounterInfo.Chunks[CurrIdx].End) {
- LLVM_BUILTIN_DEBUGTRAP;
- }
- if (CurrCount > CounterInfo.Chunks[CurrIdx].End) {
- CounterInfo.CurrChunkIdx++;
-
- /// Handle consecutive blocks.
- if (CounterInfo.CurrChunkIdx < CounterInfo.Chunks.size() &&
- CurrCount == CounterInfo.Chunks[CounterInfo.CurrChunkIdx].Begin)
- return true;
+ bool Res = Us.handleCounterIncrement(CounterInfo);
+ if (Us.ShouldPrintCounterQueries && CounterInfo.IsSet) {
+ dbgs() << "DebugCounter " << Us.RegisteredCounters[CounterName] << "="
+ << (CounterInfo.Count - 1) << (Res ? " execute" : " skip") << "\n";
}
return Res;
}
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index 80fd485..549c418 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -55,12 +55,20 @@ Error SpecialCaseList::RegexMatcher::insert(StringRef Pattern,
return Error::success();
}
+void SpecialCaseList::RegexMatcher::preprocess(bool BySize) {
+ if (BySize) {  // When false, RegExes keeps its current order.
+ llvm::stable_sort(RegExes, [](const Reg &A, const Reg &B) {
+ return A.Name.size() < B.Name.size();  // Ascending by Name length; stable_sort keeps equal lengths in their existing order.
+ });
+ }
+}
+
void SpecialCaseList::RegexMatcher::match(
StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const {
for (const auto &R : reverse(RegExes))
if (R.Rg.match(Query))
- Cb(R.Name, R.LineNo);
+ return Cb(R.Name, R.LineNo);  // Report only the first hit found while walking RegExes in reverse, then stop.
}
}
Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern,
@@ -75,12 +83,20 @@ Error SpecialCaseList::GlobMatcher::insert(StringRef Pattern,
return Error::success();
}
+void SpecialCaseList::GlobMatcher::preprocess(bool BySize) {
+ if (BySize) {  // When false, Globs keeps its current order.
+ llvm::stable_sort(Globs, [](const Glob &A, const Glob &B) {
+ return A.Name.size() < B.Name.size();  // Ascending by Name length; stable_sort keeps equal lengths in their existing order.
+ });
+ }
+}
+
void SpecialCaseList::GlobMatcher::match(
StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const {
for (const auto &G : reverse(Globs))
if (G.Pattern.match(Query))
- Cb(G.Name, G.LineNo);
+ return Cb(G.Name, G.LineNo);  // Report only the first hit found while walking Globs in reverse, then stop.
}
}
SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash)
@@ -91,6 +107,14 @@ SpecialCaseList::Matcher::Matcher(bool UseGlobs, bool RemoveDotSlash)
M.emplace<RegexMatcher>();
}
+Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) {
+ return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M);  // Dispatch to whichever matcher alternative M currently holds.
+}
+
+LLVM_ABI void SpecialCaseList::Matcher::preprocess(bool BySize) {
+ return std::visit([&](auto &V) { return V.preprocess(BySize); }, M);  // Dispatch to whichever matcher alternative M currently holds.
+}
+
void SpecialCaseList::Matcher::match(
StringRef Query,
llvm::function_ref<void(StringRef Rule, unsigned LineNo)> Cb) const {
@@ -99,10 +123,6 @@ void SpecialCaseList::Matcher::match(
return std::visit([&](auto &V) { return V.match(Query, Cb); }, M);
}
-Error SpecialCaseList::Matcher::insert(StringRef Pattern, unsigned LineNumber) {
- return std::visit([&](auto &V) { return V.insert(Pattern, LineNumber); }, M);
-}
-
// TODO: Refactor this to return Expected<...>
std::unique_ptr<SpecialCaseList>
SpecialCaseList::create(const std::vector<std::string> &Paths,
@@ -141,7 +161,7 @@ bool SpecialCaseList::createInternal(const std::vector<std::string> &Paths,
return false;
}
std::string ParseError;
- if (!parse(i, FileOrErr.get().get(), ParseError)) {
+ if (!parse(i, FileOrErr.get().get(), ParseError, /*OrderBySize=*/false)) {
Error = (Twine("error parsing file '") + Path + "': " + ParseError).str();
return false;
}
@@ -149,9 +169,9 @@ bool SpecialCaseList::createInternal(const std::vector<std::string> &Paths,
return true;
}
-bool SpecialCaseList::createInternal(const MemoryBuffer *MB,
- std::string &Error) {
- if (!parse(0, MB, Error))
+bool SpecialCaseList::createInternal(const MemoryBuffer *MB, std::string &Error,
+ bool OrderBySize) {
+ if (!parse(0, MB, Error, OrderBySize))
return false;
return true;
}
@@ -174,7 +194,7 @@ SpecialCaseList::addSection(StringRef SectionStr, unsigned FileNo,
}
bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
- std::string &Error) {
+ std::string &Error, bool OrderBySize) {
unsigned long long Version = 2;
StringRef Header = MB->getBuffer();
@@ -246,6 +266,10 @@ bool SpecialCaseList::parse(unsigned FileIdx, const MemoryBuffer *MB,
return false;
}
}
+
+ for (Section &S : Sections)
+ S.preprocess(OrderBySize);
+
return true;
}
@@ -283,6 +307,13 @@ SpecialCaseList::Section::findMatcher(StringRef Prefix,
return &II->second;
}
+LLVM_ABI void SpecialCaseList::Section::preprocess(bool OrderBySize) {
+ SectionMatcher.preprocess(false);  // SectionMatcher is always preprocessed with BySize=false, whatever OrderBySize is.
+ for (auto &[K1, E] : Entries)  // Entries is nested two levels deep; every innermost matcher receives OrderBySize.
+ for (auto &[K2, M] : E)
+ M.preprocess(OrderBySize);
+}
+
unsigned SpecialCaseList::Section::getLastMatch(StringRef Prefix,
StringRef Query,
StringRef Category) const {
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 2ea3a24..afce803 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -1363,9 +1363,12 @@ const Init *BinOpInit::Fold(const Record *CurRec) const {
}
case LISTSPLAT: {
const auto *Value = dyn_cast<TypedInit>(LHS);
- const auto *Size = dyn_cast<IntInit>(RHS);
- if (Value && Size) {
- SmallVector<const Init *, 8> Args(Size->getValue(), Value);
+ const auto *Count = dyn_cast<IntInit>(RHS);
+ if (Value && Count) {
+ if (Count->getValue() < 0)
+ PrintFatalError(Twine("!listsplat count ") + Count->getAsString() +
+ " is negative");
+ SmallVector<const Init *, 8> Args(Count->getValue(), Value);
return ListInit::get(Args, Value->getType());
}
break;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 6965116..9926a4d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26196,9 +26196,10 @@ static SDValue performFlagSettingCombine(SDNode *N,
return DCI.CombineTo(N, Res, SDValue(N, 1));
}
- // Combine identical generic nodes into this node, re-using the result.
+ // Combine equivalent generic nodes into this node, re-using the result.
if (SDNode *Generic = DCI.DAG.getNodeIfExists(
- GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
+ GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS},
+ /*AllowCommute=*/true))
DCI.CombineTo(Generic, SDValue(N, 0));
return SDValue();
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
index f110558..7e03b97 100644
--- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp
@@ -1360,14 +1360,24 @@ void AArch64EpilogueEmitter::emitEpilogue() {
}
bool CombineSPBump = shouldCombineCSRLocalStackBump(NumBytes);
- // Assume we can't combine the last pop with the sp restore.
- bool CombineAfterCSRBump = false;
+
+ unsigned ProloguePopSize = PrologueSaveSize;
if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ // With CalleeSavesAboveFrameRecord ProloguePopSize is the amount of stack
+ // that needs to be popped until we reach the start of the SVE save area.
+ // The "FixedObject" stack occurs after the SVE area and must be popped
+ // later.
+ ProloguePopSize -= FixedObject;
AfterCSRPopSize += FixedObject;
- } else if (!CombineSPBump && PrologueSaveSize != 0) {
+ }
+
+ // Assume we can't combine the last pop with the sp restore.
+ if (!CombineSPBump && ProloguePopSize != 0) {
MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
- AArch64InstrInfo::isSEHInstruction(*Pop))
+ AArch64InstrInfo::isSEHInstruction(*Pop) ||
+ (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord &&
+ isPartOfSVECalleeSaves(Pop)))
Pop = std::prev(Pop);
// Converting the last ldp to a post-index ldp is valid only if the last
// ldp's offset is 0.
@@ -1377,18 +1387,27 @@ void AArch64EpilogueEmitter::emitEpilogue() {
// may clobber), convert it to a post-index ldp.
if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
convertCalleeSaveRestoreToSPPrePostIncDec(
- Pop, DL, PrologueSaveSize, EmitCFI, MachineInstr::FrameDestroy,
- PrologueSaveSize);
+ Pop, DL, ProloguePopSize, EmitCFI, MachineInstr::FrameDestroy,
+ ProloguePopSize);
+ } else if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
+ MachineBasicBlock::iterator AfterLastPop = std::next(Pop);
+ if (AArch64InstrInfo::isSEHInstruction(*AfterLastPop))
+ ++AfterLastPop;
+ // If not, and CalleeSavesAboveFrameRecord is enabled, deallocate
+ // callee-save non-SVE registers to move the stack pointer to the start of
+ // the SVE area.
+ emitFrameOffset(MBB, AfterLastPop, DL, AArch64::SP, AArch64::SP,
+ StackOffset::getFixed(ProloguePopSize), TII,
+ MachineInstr::FrameDestroy, false, NeedsWinCFI,
+ &HasWinCFI);
} else {
- // If not, make sure to emit an add after the last ldp.
+ // Otherwise, make sure to emit an add after the last ldp.
// We're doing this by transferring the size to be restored from the
// adjustment *before* the CSR pops to the adjustment *after* the CSR
// pops.
- AfterCSRPopSize += PrologueSaveSize;
- CombineAfterCSRBump = true;
+ AfterCSRPopSize += ProloguePopSize;
}
}
-
// Move past the restores of the callee-saved registers.
// If we plan on combining the sp bump of the local stack size and the callee
// save stack size, we might need to adjust the CSR save and restore offsets.
@@ -1419,6 +1438,17 @@ void AArch64EpilogueEmitter::emitEpilogue() {
--SEHEpilogueStartI;
}
+ // Determine the ranges of SVE callee-saves. This is done before emitting any
+ // code at the end of the epilogue (for Swift async), which can get in the way
+ // of finding SVE callee-saves with CalleeSavesAboveFrameRecord.
+ auto [PPR, ZPR] = getSVEStackFrameSizes();
+ auto [PPRRange, ZPRRange] = partitionSVECS(
+ MBB,
+ SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
+ ? MBB.getFirstTerminator()
+ : FirstGPRRestoreI,
+ PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
+
if (HasFP && AFI->hasSwiftAsyncContext())
emitSwiftAsyncContextFramePointer(EpilogueEndI, DL);
@@ -1441,14 +1471,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
- auto [PPR, ZPR] = getSVEStackFrameSizes();
- auto [PPRRange, ZPRRange] = partitionSVECS(
- MBB,
- SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord
- ? MBB.getFirstTerminator()
- : FirstGPRRestoreI,
- PPR.CalleeSavesSize, ZPR.CalleeSavesSize, /*IsEpilogue=*/true);
-
StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
StackOffset SVEStackSize =
SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize;
@@ -1467,16 +1489,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
NeedsWinCFI, &HasWinCFI);
}
- // Deallocate callee-save non-SVE registers.
- emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(AFI->getCalleeSavedStackSize()), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
- // Deallocate fixed objects.
- emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
- StackOffset::getFixed(FixedObject), TII,
- MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
-
// Deallocate callee-save SVE registers.
emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false,
@@ -1619,7 +1631,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(AfterCSRPopSize), TII, MachineInstr::FrameDestroy,
false, NeedsWinCFI, &HasWinCFI, EmitCFI,
- StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
+ StackOffset::getFixed(AfterCSRPopSize - ArgumentStackToRestore));
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index dbe74b1..5700468 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -2394,15 +2394,19 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
(TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
TII->isTRANS(MI)))
- Result = true;
+ Result = !MI.mayLoadOrStore();
else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
- TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
- Result = true;
+ TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI)) {
+ // Some memory instructions may be marked as VALU (e.g. BUFFER_LOAD_*_LDS).
+ // For our purposes, these shall not be classified as VALU as this results
+ // in unexpected behavior.
+ Result = !MI.mayLoadOrStore();
+ }
else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
TII->isSALU(MI))
- Result = true;
+ Result = !MI.mayLoadOrStore();
else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
TII->isMFMAorWMMA(MI))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index a44af5f..1b559a6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2833,8 +2833,8 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
R = getMad(DAG, DL, VT, YH, CH, Mad1);
}
- const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
- (Flags.hasNoInfs() || Options.NoInfsFPMath);
+ const bool IsFiniteOnly =
+ (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
// TODO: Check if known finite from source value.
if (!IsFiniteOnly) {
@@ -3161,9 +3161,8 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
- const auto &Options = getTargetMachine().Options;
- if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
+ if (!Flags.hasNoInfs()) {
SDValue OverflowCheckConst =
DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
SDValue Overflow =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index ee466ca..596a895 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3575,7 +3575,7 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
const bool IsFiniteOnly =
(MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
- (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
+ MI.getFlag(MachineInstr::FmNoInfs);
if (!IsFiniteOnly) {
// Expand isfinite(x) => fabs(x) < inf
@@ -3864,9 +3864,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
R = B.buildSelect(Ty, Underflow, Zero, R);
- const auto &Options = MF.getTarget().Options;
-
- if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
+ if (!(Flags & MachineInstr::FmNoInfs)) {
auto OverflowCheckConst =
B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 71494be..4e11c4f 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -14,6 +14,7 @@
#include "GCNRegPressure.h"
#include "AMDGPU.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -459,10 +460,14 @@ LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI) {
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind) {
GCNRPTracker::LiveRegSet LiveRegs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
auto Reg = Register::index2VirtReg(I);
+ if (RegKind != GCNRegPressure::TOTAL_KINDS &&
+ GCNRegPressure::getRegKind(Reg, MRI) != RegKind)
+ continue;
if (!LIS.hasInterval(Reg))
continue;
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
@@ -986,3 +991,128 @@ bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
#undef PFX
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// Print to dbgs() a report about the point of maximum register pressure of
+/// kind \p Kind in \p MF: the instruction where the maximum occurs, followed
+/// by every register live at that point, split into registers with a single
+/// definition and registers with multiple definitions, each listed with its
+/// defs and uses. When \p MLI is non-null, every printed location is annotated
+/// with the header and depth of its enclosing loop, if any.
+LLVM_DUMP_METHOD void llvm::dumpMaxRegPressure(MachineFunction &MF,
+                                               GCNRegPressure::RegKind Kind,
+                                               LiveIntervals &LIS,
+                                               const MachineLoopInfo *MLI) {
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+  auto &OS = dbgs();
+  const char *RegName = GCNRegPressure::getName(Kind);
+
+  OS << "\n*** Register pressure info (" << RegName << "s) for " << MF.getName()
+     << " ***\n";
+
+  // Walk every block bottom-up and remember the first instruction at which the
+  // tracked maximum pressure for this register kind strictly increases.
+  unsigned MaxNumRegs = 0;
+  const MachineInstr *MaxPressureMI = nullptr;
+  GCNUpwardRPTracker RPT(LIS);
+  for (const MachineBasicBlock &MBB : MF) {
+    RPT.reset(MRI, LIS.getSlotIndexes()->getMBBEndIdx(&MBB).getPrevSlot());
+    for (const MachineInstr &MI : reverse(MBB)) {
+      RPT.recede(MI);
+      unsigned NumRegs = RPT.getMaxPressure().getNumRegs(Kind);
+      if (NumRegs > MaxNumRegs) {
+        MaxNumRegs = NumRegs;
+        MaxPressureMI = &MI;
+      }
+    }
+  }
+
+  // No instruction raised the pressure above zero (e.g. the function has no
+  // instructions, or registers of this kind are never live). There is no
+  // maximum pressure point to describe and MaxPressureMI must not be
+  // dereferenced.
+  if (!MaxPressureMI) {
+    OS << "Max pressure is 0 " << RegName << "s\n";
+    return;
+  }
+
+  SlotIndex MISlot = LIS.getInstructionIndex(*MaxPressureMI);
+
+  // Max pressure can occur at either the early-clobber or register slot.
+  // Choose the maximum liveset between both slots. This is ugly but this is
+  // diagnostic code.
+  SlotIndex ECSlot = MISlot.getRegSlot(true);
+  SlotIndex RSlot = MISlot.getRegSlot(false);
+  GCNRPTracker::LiveRegSet ECLiveSet = getLiveRegs(ECSlot, LIS, MRI, Kind);
+  GCNRPTracker::LiveRegSet RLiveSet = getLiveRegs(RSlot, LIS, MRI, Kind);
+  unsigned ECNumRegs = getRegPressure(MRI, ECLiveSet).getNumRegs(Kind);
+  unsigned RNumRegs = getRegPressure(MRI, RLiveSet).getNumRegs(Kind);
+  GCNRPTracker::LiveRegSet *LiveSet =
+      ECNumRegs > RNumRegs ? &ECLiveSet : &RLiveSet;
+  SlotIndex MaxPressureSlot = ECNumRegs > RNumRegs ? ECSlot : RSlot;
+  assert(getRegPressure(MRI, *LiveSet).getNumRegs(Kind) == MaxNumRegs);
+
+  // Split live registers into single-def and multi-def sets.
+  GCNRegPressure SDefPressure, MDefPressure;
+  SmallVector<Register, 16> SDefRegs, MDefRegs;
+  for (auto [Reg, LaneMask] : *LiveSet) {
+    assert(GCNRegPressure::getRegKind(Reg, MRI) == Kind);
+    LiveInterval &LI = LIS.getInterval(Reg);
+    if (LI.getNumValNums() == 1 ||
+        (LI.hasSubRanges() &&
+         llvm::all_of(LI.subranges(), [](const LiveInterval::SubRange &SR) {
+           return SR.getNumValNums() == 1;
+         }))) {
+      SDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+      SDefRegs.push_back(Reg);
+    } else {
+      MDefPressure.inc(Reg, LaneBitmask::getNone(), LaneMask, MRI);
+      MDefRegs.push_back(Reg);
+    }
+  }
+  unsigned SDefNumRegs = SDefPressure.getNumRegs(Kind);
+  unsigned MDefNumRegs = MDefPressure.getNumRegs(Kind);
+  assert(SDefNumRegs + MDefNumRegs == MaxNumRegs);
+
+  // Prints "<slot>:<block>", plus loop header and depth when loop info is
+  // available and the block belongs to a loop.
+  auto PrintLoc = [&](const MachineBasicBlock *MBB, SlotIndex SI) {
+    return Printable([&, MBB, SI](raw_ostream &OS) {
+      OS << SI << ':' << printMBBReference(*MBB);
+      if (MLI)
+        if (const MachineLoop *ML = MLI->getLoopFor(MBB))
+          OS << " (LoopHdr " << printMBBReference(*ML->getHeader())
+             << ", Depth " << ML->getLoopDepth() << ")";
+    });
+  };
+
+  // Prints one live register (class, live lanes, contribution to the pressure)
+  // followed by its non-debug defs/uses in slot order.
+  auto PrintRegInfo = [&](Register Reg, LaneBitmask LiveMask) {
+    GCNRegPressure RegPressure;
+    RegPressure.inc(Reg, LaneBitmask::getNone(), LiveMask, MRI);
+    OS << " " << printReg(Reg, TRI) << ':'
+       << TRI->getRegClassName(MRI.getRegClass(Reg)) << ", LiveMask "
+       << PrintLaneMask(LiveMask) << " (" << RegPressure.getNumRegs(Kind) << ' '
+       << RegName << "s)\n";
+
+    // Use std::map to sort def/uses by SlotIndex.
+    std::map<SlotIndex, const MachineInstr *> Instrs;
+    for (const MachineInstr &MI : MRI.reg_nodbg_instructions(Reg)) {
+      Instrs[LIS.getInstructionIndex(MI).getRegSlot()] = &MI;
+    }
+
+    for (const auto &[SI, MI] : Instrs) {
+      OS << " ";
+      if (MI->definesRegister(Reg, TRI))
+        OS << "def ";
+      if (MI->readsRegister(Reg, TRI))
+        OS << "use ";
+      OS << PrintLoc(MI->getParent(), SI) << ": " << *MI;
+    }
+  };
+
+  OS << "Max pressure is " << MaxNumRegs << ' ' << RegName << "s at "
+     << PrintLoc(MaxPressureMI->getParent(), MaxPressureSlot) << ": "
+     << *MaxPressureMI;
+
+  OS << "\nLive registers with single definition (" << SDefNumRegs << ' '
+     << RegName << "s):\n";
+
+  // Sort SDefRegs by number of uses (smallest first)
+  llvm::sort(SDefRegs, [&](Register A, Register B) {
+    return std::distance(MRI.use_nodbg_begin(A), MRI.use_nodbg_end()) <
+           std::distance(MRI.use_nodbg_begin(B), MRI.use_nodbg_end());
+  });
+
+  for (const Register Reg : SDefRegs) {
+    PrintRegInfo(Reg, LiveSet->lookup(Reg));
+  }
+
+  OS << "\nLive registers with multiple definitions (" << MDefNumRegs << ' '
+     << RegName << "s):\n";
+  for (const Register Reg : MDefRegs) {
+    PrintRegInfo(Reg, LiveSet->lookup(Reg));
+  }
+}
+#endif
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 898d1ff..979a8b0 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -31,6 +31,12 @@ class SlotIndex;
struct GCNRegPressure {
enum RegKind { SGPR, VGPR, AGPR, AVGPR, TOTAL_KINDS };
+ /// \returns the printable name of register kind \p Kind, looked up in a
+ /// fixed table ("SGPR", "VGPR", "AGPR", "AVGPR"). \p Kind must be less than
+ /// TOTAL_KINDS (asserted).
+ static constexpr const char *getName(RegKind Kind) {
+ const char *Names[] = {"SGPR", "VGPR", "AGPR", "AVGPR"};
+ assert(Kind < TOTAL_KINDS);
+ return Names[Kind];
+ }
+
GCNRegPressure() {
clear();
}
@@ -41,6 +47,11 @@ struct GCNRegPressure {
void clear() { std::fill(&Value[0], &Value[ValueArraySize], 0); }
+ /// \returns the entry of Value recorded for register kind \p Kind.
+ /// \p Kind must be less than TOTAL_KINDS (asserted).
+ unsigned getNumRegs(RegKind Kind) const {
+ assert(Kind < TOTAL_KINDS);
+ return Value[Kind];
+ }
+
/// \returns the SGPR32 pressure
unsigned getSGPRNum() const { return Value[SGPR]; }
/// \returns the aggregated ArchVGPR32, AccVGPR32, and Pseudo AVGPR pressure
@@ -138,6 +149,12 @@ struct GCNRegPressure {
void dump() const;
+  /// \returns the register kind of virtual register \p Reg, obtained by
+  /// classifying the register class that \p MRI records for it.
+  static RegKind getRegKind(unsigned Reg, const MachineRegisterInfo &MRI) {
+    const auto *STI =
+        static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+    return static_cast<RegKind>(getRegKind(MRI.getRegClass(Reg), STI));
+  }
+
private:
static constexpr unsigned ValueArraySize = TOTAL_KINDS * 2;
@@ -294,8 +311,10 @@ public:
}
};
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
+GCNRPTracker::LiveRegSet
+getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ GCNRegPressure::RegKind RegKind = GCNRegPressure::TOTAL_KINDS);
////////////////////////////////////////////////////////////////////////////////
// GCNUpwardRPTracker
@@ -428,9 +447,6 @@ LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
const MachineRegisterInfo &MRI,
LaneBitmask LaneMaskFilter = LaneBitmask::getAll());
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
- const MachineRegisterInfo &MRI);
-
/// creates a map MachineInstr -> LiveRegSet
/// R - range of iterators on instructions
/// After - upon entry or exit of every instruction
@@ -524,6 +540,11 @@ public:
}
};
+LLVM_ABI void dumpMaxRegPressure(MachineFunction &MF,
+ GCNRegPressure::RegKind Kind,
+ LiveIntervals &LIS,
+ const MachineLoopInfo *MLI);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index bdc0810..58482ea 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -69,6 +69,21 @@ static cl::opt<bool> GCNTrackers(
cl::desc("Use the AMDGPU specific RPTrackers during scheduling"),
cl::init(false));
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+#define DUMP_MAX_REG_PRESSURE
+static cl::opt<bool> PrintMaxRPRegUsageBeforeScheduler(
+ "amdgpu-print-max-reg-pressure-regusage-before-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure before scheduling."),
+ cl::init(false));
+
+static cl::opt<bool> PrintMaxRPRegUsageAfterScheduler(
+ "amdgpu-print-max-reg-pressure-regusage-after-scheduler", cl::Hidden,
+ cl::desc("Print a list of live registers along with their def/uses at the "
+ "point of maximum register pressure after scheduling."),
+ cl::init(false));
+#endif
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -960,6 +975,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
RegionLiveOuts.buildLiveRegMap();
}
+#ifdef DUMP_MAX_REG_PRESSURE
+ if (PrintMaxRPRegUsageBeforeScheduler) {
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
+ dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
+ LIS->dump();
+ }
+#endif
+
GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
while (S.advanceStage()) {
auto Stage = createSchedStage(S.getCurrentStage());
@@ -995,6 +1018,14 @@ void GCNScheduleDAGMILive::runSchedStages() {
Stage->finalizeGCNSchedStage();
}
+
+#ifdef DUMP_MAX_REG_PRESSURE
+ if (PrintMaxRPRegUsageAfterScheduler) {
+ dumpMaxRegPressure(MF, GCNRegPressure::VGPR, *LIS, MLI);
+ dumpMaxRegPressure(MF, GCNRegPressure::SGPR, *LIS, MLI);
+ LIS->dump();
+ }
+#endif
}
#ifndef NDEBUG
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 64e34db..5f6d742 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -260,8 +260,12 @@ class NSAHelper {
}
class MIMGNSAHelper<int num_addrs,
- list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)>
- : NSAHelper<> {
+ list<RegisterOperand> addr_types_in=[]>
+ : NSAHelper<> {
+ list<RegisterOperand> addr_types =
+ !if(!empty(addr_types_in), !listsplat(VGPROp_32, num_addrs),
+ addr_types_in);
+
list<string> AddrAsmNames = !foreach(i, !range(num_addrs), "vaddr" # i);
let AddrIns = !dag(ins, addr_types, AddrAsmNames);
let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
@@ -358,7 +362,7 @@ class MIMG_gfx11<int op, dag outs, string dns = "">
// Base class for all NSA MIMG instructions.
// Note that 1-dword addresses always use non-NSA variants.
class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
- list<RegisterClass> addr_types=[],
+ list<RegisterOperand> addr_types=[],
RegisterOperand LastAddrRC = VGPROp_32>
: MIMG<outs, dns>, MIMGe_gfx11<op> {
let SubtargetPredicate = isGFX11Only;
@@ -378,7 +382,7 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
}
class VIMAGE_gfx12<int op, dag outs, int num_addrs, string dns="",
- list<RegisterClass> addr_types=[]>
+ list<RegisterOperand> addr_types=[]>
: VIMAGE<outs, dns>, VIMAGEe<op> {
let SubtargetPredicate = isGFX12Plus;
let AssemblerPredicate = isGFX12Plus;
@@ -1521,12 +1525,12 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16, bit isDual, bit isBVH8> {
int VAddrDwords = !srl(Size, 5);
int GFX11PlusNSAAddrs = !if(IsA16, 4, 5);
- RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32);
- list<RegisterClass> GFX11PlusAddrTypes =
- !cond(isBVH8 : [node_ptr_type, VReg_64, VReg_96, VReg_96, VGPR_32],
- isDual : [node_ptr_type, VReg_64, VReg_96, VReg_96, VReg_64],
- IsA16 : [node_ptr_type, VGPR_32, VReg_96, VReg_96],
- true : [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]);
+ RegisterOperand node_ptr_type = !if(Is64, VGPROp_64, VGPROp_32);
+ list<RegisterOperand> GFX11PlusAddrTypes =
+ !cond(isBVH8 : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_32],
+ isDual : [node_ptr_type, VGPROp_64, VGPROp_96, VGPROp_96, VGPROp_64],
+ IsA16 : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96],
+ true : [node_ptr_type, VGPROp_32, VGPROp_96, VGPROp_96, VGPROp_96]);
}
class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterOperand AddrRC>
@@ -1552,7 +1556,7 @@ class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterOperand AddrRC>
}
class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
- list<RegisterClass> addr_types>
+ list<RegisterOperand> addr_types>
: MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "GFX11",
addr_types> {
let InOperandList = !con(nsah.AddrIns, (ins SReg_128_XNULL:$srsrc, A16:$a16));
@@ -1561,7 +1565,7 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs,
bit isDual, bit isBVH8,
- list<RegisterClass> addr_types>
+ list<RegisterOperand> addr_types>
: VIMAGE_gfx12<op.GFX12, !if(!or(isDual, isBVH8),
(outs VReg_320:$vdata, VReg_96:$ray_origin_out,
VReg_96:$ray_dir_out),
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 5e27b37..6dcbced 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1019,7 +1019,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// SMEM and VMEM operations. So there will never be
// outstanding address translations for both SMEM and
// VMEM at the same time.
- setScoreLB(T, CurrScore - 1);
+ setScoreLB(T, getScoreUB(T) - 1);
PendingEvents &= ~(1 << OtherEvent);
}
for (const MachineOperand &Op : Inst.all_uses())
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index ec5c5bb3..a44a247 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -899,7 +899,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (DestReg == AMDGPU::VCC) {
- if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+ if (AMDGPU::SReg_64_EncodableRegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index eac9fd4..27e5ee9c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3726,6 +3726,23 @@ def : GCNPat <
} // End foreach Ty = ...
} // End AddedComplexity = 1
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat<
+ (i32 (DivergentBinFrag<or>
+ (i32 (zext i16:$src_lo)),
+ (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi)))))
+ )),
+ (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16)
+>;
+def : GCNPat<
+ (i32 (DivergentBinFrag<or>
+ (i32 (bitconvert (v2i16 (build_vector (i16 0), (i16 VGPR_16:$src_hi))))),
+ (i32 (zext i16:$src_lo))
+ )),
+ (REG_SEQUENCE VGPR_32, $src_lo, lo16, $src_hi, hi16)
+>;
+}
+
let True16Predicate = UseRealTrue16Insts in
def : GCNPat <
(v2i16 (DivergentBinFrag<build_vector> (i16 undef), (i16 (trunc i32:$src1)))),
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index bef4868..7e7ee75 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -280,6 +280,10 @@ static unsigned getTcgen05LdOpcode(unsigned IID, bool enablePack) {
}
void NVPTXDAGToDAGISel::SelectTcgen05Ld(SDNode *N, bool hasOffset) {
+ if (!Subtarget->hasTcgen05InstSupport())
+ report_fatal_error(
+ "tcgen05.ld is not supported on this architecture variant");
+
SDLoc DL(N);
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
@@ -2136,6 +2140,10 @@ static unsigned getTcgen05StOpcode(unsigned IID, bool enableUnpack) {
}
void NVPTXDAGToDAGISel::SelectTcgen05St(SDNode *N, bool hasOffset) {
+ if (!Subtarget->hasTcgen05InstSupport())
+ report_fatal_error(
+ "tcgen05.st is not supported on this architecture variant");
+
SDLoc DL(N);
unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 6c14cf0..dfde0cc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -101,6 +101,22 @@ def PrmtMode : Operand<i32> {
// NVPTX Instruction Predicate Definitions
//===----------------------------------------------------------------------===//
+// Checks PTX version and family-specific and architecture-specific SM versions.
+// For example, sm_100{f/a} and any future variants in the same family will match
+// for any PTX version greater than or equal to `PTXVersion`.
+class PTXWithFamilySMs<int PTXVersion, list<int> SMVersions> :
+ Predicate<"Subtarget->hasPTXWithFamilySMs(" # PTXVersion # ", {" #
+ !interleave(SMVersions, ", ") # "})">;
+
+// Checks PTX version and architecture-specific SM versions.
+// For example, sm_100{a} will match for any PTX version
+// greater than or equal to `PTXVersion`.
+class PTXWithAccelSMs<int PTXVersion, list<int> SMVersions> :
+ Predicate<"Subtarget->hasPTXWithAccelSMs(" # PTXVersion # ", {" #
+ !interleave(SMVersions, ", ") # "})">;
+
+// Helper predicate to call a subtarget method.
+class callSubtarget<string SubtargetMethod> : Predicate<"Subtarget->" # SubtargetMethod # "()">;
def hasAtomAddF64 : Predicate<"Subtarget->hasAtomAddF64()">;
def hasAtomScope : Predicate<"Subtarget->hasAtomScope()">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index a8b854f..22cf3a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -5103,8 +5103,8 @@ let Predicates = [hasSM<90>, hasPTX<78>] in {
def EXIT : NullaryInst<"exit", int_nvvm_exit>;
// Tcgen05 intrinsics
-let isConvergent = true, Predicates = [hasTcgen05Instructions] in {
-
+let isConvergent = true in {
+let Predicates = [callSubtarget<"hasTcgen05InstSupport">] in {
multiclass TCGEN05_ALLOC_INTR<string AS, string num, Intrinsic Intr> {
def "" : BasicNVPTXInst<(outs),
(ins ADDR:$dst, B32:$ncols),
@@ -5156,15 +5156,6 @@ defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<"", "2">;
defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<"shared", "1">;
defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<"shared", "2">;
-multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> {
- def "" : BasicNVPTXInst<(outs),
- (ins ADDR:$tmem_addr),
- "tcgen05.shift.cta_group::" # num # ".down",
- [(Intr addr:$tmem_addr)]>;
-}
-defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>;
-defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>;
-
multiclass TCGEN05_CP_INTR<string shape, string src_fmt, string mc = ""> {
defvar dst_fmt = !if(!eq(src_fmt, ""), "", ".b8x16");
defvar fmt_asm = StrJoin<".", [dst_fmt, src_fmt]>.ret;
@@ -5195,9 +5186,22 @@ foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in {
defm TCGEN05_CP_64x128_2 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::01_23">;
defm TCGEN05_CP_32x128 # src_fmt : TCGEN05_CP_INTR<"32x128b", src_fmt, "warpx4">;
}
+} // Predicates
+
+let Predicates = [callSubtarget<"hasTcgen05ShiftSupport">] in {
+multiclass TCGEN05_SHIFT_INTR<string num, Intrinsic Intr> {
+ def "" : BasicNVPTXInst<(outs),
+ (ins ADDR:$tmem_addr),
+ "tcgen05.shift.cta_group::" # num # ".down",
+ [(Intr addr:$tmem_addr)]>;
+}
+defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>;
+defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>;
+} // Predicates
+
} // isConvergent
-let hasSideEffects = 1, Predicates = [hasTcgen05Instructions] in {
+let hasSideEffects = 1, Predicates = [callSubtarget<"hasTcgen05InstSupport">] in {
def tcgen05_fence_before_thread_sync: NullaryInst<
"tcgen05.fence::before_thread_sync", int_nvvm_tcgen05_fence_before_thread_sync>;
@@ -5231,8 +5235,7 @@ class TCGEN05_LDST_REGINFO<int Veclen> {
//
class TCGEN05_LD_INST<string Shape, int Num, bit Pack> :
- NVPTXInst<(outs), (ins), "?", []>,
- Requires<[hasTcgen05Instructions]> {
+ NVPTXInst<(outs), (ins), "?", []> {
TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO<
NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>;
@@ -5256,8 +5259,7 @@ class TCGEN05_LD_INST<string Shape, int Num, bit Pack> :
//
class TCGEN05_ST_INST<string Shape, int Num, bit Unpack> :
- NVPTXInst<(outs), (ins), "?", []>,
- Requires<[hasTcgen05Instructions]> {
+ NVPTXInst<(outs), (ins), "?", []> {
TCGEN05_LDST_REGINFO Info = TCGEN05_LDST_REGINFO<
NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.veclen>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index c548967..989be50 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -72,6 +72,40 @@ const SelectionDAGTargetInfo *NVPTXSubtarget::getSelectionDAGInfo() const {
return TSInfo.get();
}
+/// Returns true when the subtarget has family-specific features, its PTX
+/// version is at least \p PTXVersion, and its SM version matches one of
+/// \p SMVersions: either the same family with an SM version at least as new,
+/// or, where sm_101 is involved, an exact match only.
+bool NVPTXSubtarget::hasPTXWithFamilySMs(unsigned PTXVersion,
+                                         ArrayRef<unsigned> SMVersions) const {
+  const unsigned CurPTX = getPTXVersion();
+  if (!hasFamilySpecificFeatures() || CurPTX < PTXVersion)
+    return false;
+
+  const unsigned CurSM = getSmVersion();
+  // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not supported
+  // there.
+  const bool IsRetiredSM101 = CurPTX >= 90 && CurSM == 101;
+
+  for (unsigned SM : SMVersions) {
+    // sm_101 is a different family, never group it with sm_10x: whenever it
+    // appears on either side, only an exact match counts.
+    if (CurSM == 101 || SM == 101) {
+      if (CurSM == SM && !IsRetiredSM101)
+        return true;
+      continue;
+    }
+
+    if (getSmFamilyVersion() == SM / 10 && CurSM >= SM)
+      return true;
+  }
+  return false;
+}
+
+/// Returns true when the subtarget has architecture-accelerated features, its
+/// PTX version is at least \p PTXVersion, and its SM version is exactly one of
+/// \p SMVersions.
+bool NVPTXSubtarget::hasPTXWithAccelSMs(unsigned PTXVersion,
+                                        ArrayRef<unsigned> SMVersions) const {
+  const unsigned CurPTX = getPTXVersion();
+  if (!hasArchAccelFeatures() || CurPTX < PTXVersion)
+    return false;
+
+  const unsigned CurSM = getSmVersion();
+  // PTX 9.0 and later renamed sm_101 to sm_110, so sm_101 is not supported
+  // there, whatever the list contains.
+  if (CurPTX >= 90 && CurSM == 101)
+    return false;
+
+  return llvm::is_contained(SMVersions, CurSM);
+}
+
bool NVPTXSubtarget::allowFP16Math() const {
return hasFP16Math() && NoF16Math == false;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e81c56b..194dbdc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -73,6 +73,18 @@ public:
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
+ // Checks PTX version and family-specific and architecture-specific SM
+ // versions. For example, sm_100{f/a} and any future variants in the same
+ // family will match for any PTX version greater than or equal to
+ // `PTXVersion`.
+ bool hasPTXWithFamilySMs(unsigned PTXVersion,
+ ArrayRef<unsigned> SMVersions) const;
+ // Checks PTX version and architecture-specific SM versions.
+ // For example, sm_100{a} will match for any PTX version greater than or equal
+ // to `PTXVersion`.
+ bool hasPTXWithAccelSMs(unsigned PTXVersion,
+ ArrayRef<unsigned> SMVersions) const;
+
bool has256BitVectorLoadStore(unsigned AS) const {
return SmVersion >= 100 && PTXVersion >= 88 &&
AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
@@ -127,6 +139,27 @@ public:
return HasTcgen05 && PTXVersion >= MinPTXVersion;
}
+  // Checks support for the following instructions:
+  //  - tcgen05.ld/st
+  //  - tcgen05.alloc/dealloc/relinquish
+  //  - tcgen05.cp
+  //  - tcgen05.fence/wait
+  //  - tcgen05.commit
+  bool hasTcgen05InstSupport() const {
+    // sm_101 was renamed to sm_110 in PTX 9.0, hence the separate 9.0 entry.
+    if (hasPTXWithFamilySMs(90, {100, 110}))
+      return true;
+    if (hasPTXWithFamilySMs(88, {100, 101}))
+      return true;
+    return hasPTXWithAccelSMs(86, {100, 101});
+  }
+
+  // Checks support for the tcgen05.shift instruction. Only the
+  // architecture-accelerated check is used here.
+  bool hasTcgen05ShiftSupport() const {
+    // sm_101 was renamed to sm_110 in PTX 9.0, hence the separate 9.0 entry.
+    if (hasPTXWithAccelSMs(90, {100, 110, 103}))
+      return true;
+    if (hasPTXWithAccelSMs(88, {100, 101, 103}))
+      return true;
+    return hasPTXWithAccelSMs(86, {100, 101});
+  }
+
// True only when FullSmVersion is exactly 1003 and PTXVersion is at least 86.
// NOTE(review): getSmVersion() is FullSmVersion / 10, so 1003 is one specific
// variant of SM 100 -- confirm which target suffix the trailing 3 encodes.
bool hasTcgen05MMAScaleInputDImm() const {
return FullSmVersion == 1003 && PTXVersion >= 86;
}
@@ -158,6 +191,7 @@ public:
bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
unsigned int getFullSmVersion() const { return FullSmVersion; }
unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
+ // SM family number: the full SM version with its last two decimal digits
+ // dropped (a FullSmVersion of 1003 yields 10).
+ unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; }
// GPUs with "a" suffix have architecture-accelerated features that are
// supported on the specified architecture only, hence such targets do not
// follow the onion layer model. hasArchAccelFeatures() allows distinguishing
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 4b54231..8851a0f 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -1659,6 +1659,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return generateImmOutOfRangeError(
Operands, ErrorInfo, -1, (1 << 5) - 1,
"immediate must be non-zero in the range");
+ case Match_InvalidXSfmmVType: {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return generateXSfmmVTypeError(ErrorLoc);
+ }
case Match_InvalidVTypeI: {
SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
return generateVTypeError(ErrorLoc);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 70b7c43..e75dfe3 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -142,6 +142,22 @@ enum {
ReadsPastVLShift = DestEEWShift + 2,
ReadsPastVLMask = 1ULL << ReadsPastVLShift,
+
+ // 0 -> Don't care about altfmt bit in VTYPE.
+ // 1 -> Is not altfmt.
+ // 2 -> Is altfmt(BF16).
+ AltFmtTypeShift = ReadsPastVLShift + 1,
+ AltFmtTypeMask = 3ULL << AltFmtTypeShift,
+
+ // XSfmmbase
+ HasTWidenOpShift = AltFmtTypeShift + 2,
+ HasTWidenOpMask = 1ULL << HasTWidenOpShift,
+
+ HasTMOpShift = HasTWidenOpShift + 1,
+ HasTMOpMask = 1ULL << HasTMOpShift,
+
+ HasTKOpShift = HasTMOpShift + 1,
+ HasTKOpMask = 1ULL << HasTKOpShift,
};
// Helper functions to read TSFlags.
@@ -183,6 +199,11 @@ static inline bool hasRoundModeOp(uint64_t TSFlags) {
return TSFlags & HasRoundModeOpMask;
}
+// Decoded value of the two-bit AltFmtType field of TSFlags. The enumerators
+// are declared in encoding order (0, 1, 2).
+enum class AltFmtType { DontCare, NotAltFmt, AltFmt };
+
+/// \returns the AltFmtType field extracted from \p TSFlags.
+static inline AltFmtType getAltFmtType(uint64_t TSFlags) {
+  const uint64_t Field = (TSFlags & AltFmtTypeMask) >> AltFmtTypeShift;
+  return static_cast<AltFmtType>(Field);
+}
+
/// \returns true if this instruction uses vxrm
static inline bool usesVXRM(uint64_t TSFlags) { return TSFlags & UsesVXRMMask; }
@@ -204,11 +225,47 @@ static inline bool readsPastVL(uint64_t TSFlags) {
return TSFlags & ReadsPastVLMask;
}
+// XSfmmbase
+/// \returns true if the HasTWidenOp bit is set in \p TSFlags.
+static inline bool hasTWidenOp(uint64_t TSFlags) {
+  return (TSFlags & HasTWidenOpMask) != 0;
+}
+
+/// \returns true if the HasTMOp bit is set in \p TSFlags.
+static inline bool hasTMOp(uint64_t TSFlags) {
+  return (TSFlags & HasTMOpMask) != 0;
+}
+
+/// \returns true if the HasTKOp bit is set in \p TSFlags.
+static inline bool hasTKOp(uint64_t TSFlags) {
+  return (TSFlags & HasTKOpMask) != 0;
+}
+
+/// \returns the operand index of TN, located by counting back from the end of
+/// the operand list.
+static inline unsigned getTNOpNum(const MCInstrDesc &Desc) {
+  const uint64_t TSFlags = Desc.TSFlags;
+  assert(hasTWidenOp(TSFlags) && hasVLOp(TSFlags));
+  // TN sits one slot further from the end when a TK operand is present.
+  const unsigned FromEnd = hasTKOp(TSFlags) ? 4 : 3;
+  return Desc.getNumOperands() - FromEnd;
+}
+
+/// \returns the operand index of TM, located by counting back from the end of
+/// the operand list.
+static inline unsigned getTMOpNum(const MCInstrDesc &Desc) {
+  const uint64_t TSFlags = Desc.TSFlags;
+  assert(hasTWidenOp(TSFlags) && hasTMOp(TSFlags));
+  // With a TK operand TM is fifth from the end; without one (vtzero.t) it is
+  // fourth from the end.
+  const unsigned FromEnd = hasTKOp(TSFlags) ? 5 : 4;
+  return Desc.getNumOperands() - FromEnd;
+}
+
+/// \returns the operand index of TK, which is third from the end of the
+/// operand list.
+static inline unsigned getTKOpNum(const MCInstrDesc &Desc) {
+  assert(hasTWidenOp(Desc.TSFlags) && hasTKOp(Desc.TSFlags));
+  const unsigned NumOps = Desc.getNumOperands();
+  return NumOps - 3;
+}
+
static inline unsigned getVLOpNum(const MCInstrDesc &Desc) {
const uint64_t TSFlags = Desc.TSFlags;
// This method is only called if we expect to have a VL operand, and all
// instructions with VL also have SEW.
assert(hasSEWOp(TSFlags) && hasVLOp(TSFlags));
+ // In Xsfmmbase, TN is an alias for VL, so here we use the same TSFlags bit.
+ if (hasTWidenOp(TSFlags))
+ return getTNOpNum(Desc);
unsigned Offset = 2;
if (hasVecPolicyOp(TSFlags))
Offset = 3;
@@ -226,7 +283,7 @@ static inline unsigned getSEWOpNum(const MCInstrDesc &Desc) {
const uint64_t TSFlags = Desc.TSFlags;
assert(hasSEWOp(TSFlags));
unsigned Offset = 1;
- if (hasVecPolicyOp(TSFlags))
+ if (hasVecPolicyOp(TSFlags) || hasTWidenOp(TSFlags))
Offset = 2;
return Desc.getNumOperands() - Offset;
}
@@ -243,6 +300,9 @@ static inline int getFRMOpNum(const MCInstrDesc &Desc) {
if (!hasRoundModeOp(TSFlags) || usesVXRM(TSFlags))
return -1;
+ if (hasTWidenOp(TSFlags) && hasTMOp(TSFlags))
+ return getTMOpNum(Desc) - 1;
+
// The operand order
// --------------------------------------
// | n-1 (if any) | n-2 | n-3 | n-4 |
@@ -385,7 +445,9 @@ enum OperandType : unsigned {
OPERAND_SEW_MASK,
// Vector rounding mode for VXRM or FRM.
OPERAND_VEC_RM,
- OPERAND_LAST_RISCV_IMM = OPERAND_VEC_RM,
+ // Vtype operand for XSfmm extension.
+ OPERAND_XSFMM_VTYPE,
+ OPERAND_LAST_RISCV_IMM = OPERAND_XSFMM_VTYPE,
// Operand is either a register or uimm5, this is used by V extension pseudo
// instructions to represent a value that can be passed as AVL to either vsetvli
// or vsetivli.
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 437022f..974252a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3353,14 +3353,20 @@ bool RISCVDAGToDAGISel::selectSETCC(SDValue N, ISD::CondCode ExpectedCCVal,
0);
return true;
}
- // If the RHS is [-2047,2048], we can use addi with -RHS to produce 0 if the
- // LHS is equal to the RHS and non-zero otherwise.
+ // If the RHS is [-2047,2048], we can use addi/addiw with -RHS to produce 0
+ // if the LHS is equal to the RHS and non-zero otherwise.
if (isInt<12>(CVal) || CVal == 2048) {
- Val = SDValue(
- CurDAG->getMachineNode(
- RISCV::ADDI, DL, N->getValueType(0), LHS,
- CurDAG->getSignedTargetConstant(-CVal, DL, N->getValueType(0))),
- 0);
+ unsigned Opc = RISCV::ADDI;
+ if (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ cast<VTSDNode>(LHS.getOperand(1))->getVT() == MVT::i32) {
+ Opc = RISCV::ADDIW;
+ LHS = LHS.getOperand(0);
+ }
+
+ Val = SDValue(CurDAG->getMachineNode(Opc, DL, N->getValueType(0), LHS,
+ CurDAG->getSignedTargetConstant(
+ -CVal, DL, N->getValueType(0))),
+ 0);
return true;
}
if (isPowerOf2_64(CVal) && Subtarget->hasStdExtZbs()) {
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index cf8d120..1b7cb9b 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -168,10 +168,13 @@ struct DemandedFields {
// If this is true, we demand that VTYPE is set to some legal state, i.e. that
// vill is unset.
bool VILL = false;
+ bool TWiden = false;
+ bool AltFmt = false;
// Return true if any part of VTYPE was used
bool usedVTYPE() const {
- return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL;
+ return SEW || LMUL || SEWLMULRatio || TailPolicy || MaskPolicy || VILL ||
+ TWiden || AltFmt;
}
// Return true if any property of VL was used
@@ -187,6 +190,8 @@ struct DemandedFields {
TailPolicy = true;
MaskPolicy = true;
VILL = true;
+ TWiden = true;
+ AltFmt = true;
}
// Mark all VL properties as demanded
@@ -212,6 +217,8 @@ struct DemandedFields {
TailPolicy |= B.TailPolicy;
MaskPolicy |= B.MaskPolicy;
VILL |= B.VILL;
+ AltFmt |= B.AltFmt;
+ TWiden |= B.TWiden;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -258,7 +265,9 @@ struct DemandedFields {
OS << "SEWLMULRatio=" << SEWLMULRatio << ", ";
OS << "TailPolicy=" << TailPolicy << ", ";
OS << "MaskPolicy=" << MaskPolicy << ", ";
- OS << "VILL=" << VILL;
+ OS << "VILL=" << VILL << ", ";
+ OS << "AltFmt=" << AltFmt << ", ";
+ OS << "TWiden=" << TWiden;
OS << "}";
}
#endif
@@ -328,6 +337,15 @@ static bool areCompatibleVTYPEs(uint64_t CurVType, uint64_t NewVType,
if (Used.MaskPolicy && RISCVVType::isMaskAgnostic(CurVType) !=
RISCVVType::isMaskAgnostic(NewVType))
return false;
+ if (Used.TWiden && (RISCVVType::hasXSfmmWiden(CurVType) !=
+ RISCVVType::hasXSfmmWiden(NewVType) ||
+ (RISCVVType::hasXSfmmWiden(CurVType) &&
+ RISCVVType::getXSfmmWiden(CurVType) !=
+ RISCVVType::getXSfmmWiden(NewVType))))
+ return false;
+ if (Used.AltFmt &&
+ RISCVVType::isAltFmt(CurVType) != RISCVVType::isAltFmt(NewVType))
+ return false;
return true;
}
@@ -479,6 +497,11 @@ DemandedFields getDemanded(const MachineInstr &MI, const RISCVSubtarget *ST) {
Res.TailPolicy = false;
}
+ Res.AltFmt = RISCVII::getAltFmtType(MI.getDesc().TSFlags) !=
+ RISCVII::AltFmtType::DontCare;
+ Res.TWiden = RISCVII::hasTWidenOp(MI.getDesc().TSFlags) ||
+ RISCVInstrInfo::isXSfmmVectorConfigInstr(MI);
+
return Res;
}
@@ -510,6 +533,8 @@ class VSETVLIInfo {
uint8_t TailAgnostic : 1;
uint8_t MaskAgnostic : 1;
uint8_t SEWLMULRatioOnly : 1;
+ uint8_t AltFmt : 1;
+ uint8_t TWiden : 3;
public:
VSETVLIInfo()
@@ -586,6 +611,8 @@ public:
RISCVVType::VLMUL getVLMUL() const { return VLMul; }
bool getTailAgnostic() const { return TailAgnostic; }
bool getMaskAgnostic() const { return MaskAgnostic; }
+ bool getAltFmt() const { return AltFmt; }
+ unsigned getTWiden() const { return TWiden; }
bool hasNonZeroAVL(const LiveIntervals *LIS) const {
if (hasAVLImm())
@@ -647,21 +674,31 @@ public:
SEW = RISCVVType::getSEW(VType);
TailAgnostic = RISCVVType::isTailAgnostic(VType);
MaskAgnostic = RISCVVType::isMaskAgnostic(VType);
+ AltFmt = RISCVVType::isAltFmt(VType);
+ TWiden =
+ RISCVVType::hasXSfmmWiden(VType) ? RISCVVType::getXSfmmWiden(VType) : 0;
}
- void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA) {
+ void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA, bool Altfmt,
+ unsigned W) {
assert(isValid() && !isUnknown() &&
"Can't set VTYPE for uninitialized or unknown");
VLMul = L;
SEW = S;
TailAgnostic = TA;
MaskAgnostic = MA;
+ AltFmt = Altfmt;
+ TWiden = W;
}
+ void setAltFmt(bool AF) { AltFmt = AF; }
+
void setVLMul(RISCVVType::VLMUL VLMul) { this->VLMul = VLMul; }
unsigned encodeVTYPE() const {
assert(isValid() && !isUnknown() && !SEWLMULRatioOnly &&
"Can't encode VTYPE for uninitialized or unknown");
+ if (TWiden != 0)
+ return RISCVVType::encodeXSfmmVType(SEW, TWiden, AltFmt);
return RISCVVType::encodeVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic);
}
@@ -674,9 +711,9 @@ public:
"Can't compare VTYPE in unknown state");
assert(!SEWLMULRatioOnly && !Other.SEWLMULRatioOnly &&
"Can't compare when only LMUL/SEW ratio is valid.");
- return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic) ==
+ return std::tie(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden) ==
std::tie(Other.VLMul, Other.SEW, Other.TailAgnostic,
- Other.MaskAgnostic);
+ Other.MaskAgnostic, Other.AltFmt, Other.TWiden);
}
unsigned getSEWLMULRatio() const {
@@ -825,7 +862,9 @@ public:
<< "SEW=e" << (unsigned)SEW << ", "
<< "TailAgnostic=" << (bool)TailAgnostic << ", "
<< "MaskAgnostic=" << (bool)MaskAgnostic << ", "
- << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << "}";
+ << "SEWLMULRatioOnly=" << (bool)SEWLMULRatioOnly << ", "
+ << "TWiden=" << (unsigned)TWiden << ", "
+ << "AltFmt=" << (bool)AltFmt << "}";
}
#endif
};
@@ -853,6 +892,11 @@ struct BlockData {
BlockData() = default;
};
+// Selects which XSfmm configuration pseudo insertVSETMTK emits.
+enum TKTMMode {
+ // Emit PseudoSF_VSETTK for instructions that have a TK operand.
+ VSETTK = 0,
+ // Emit PseudoSF_VSETTM for instructions that have a TM operand.
+ VSETTM = 1,
+};
+
class RISCVInsertVSETVLI : public MachineFunctionPass {
const RISCVSubtarget *ST;
const TargetInstrInfo *TII;
@@ -908,6 +952,7 @@ private:
VSETVLIInfo getInfoForVSETVLI(const MachineInstr &MI) const;
VSETVLIInfo computeInfoForInstr(const MachineInstr &MI) const;
void forwardVSETVLIAVL(VSETVLIInfo &Info) const;
+ bool insertVSETMTK(MachineBasicBlock &MBB, TKTMMode Mode) const;
};
} // end anonymous namespace
@@ -945,6 +990,18 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const {
VSETVLIInfo NewInfo;
if (MI.getOpcode() == RISCV::PseudoVSETIVLI) {
NewInfo.setAVLImm(MI.getOperand(1).getImm());
+ } else if (RISCVInstrInfo::isXSfmmVectorConfigTNInstr(MI)) {
+ assert(MI.getOpcode() == RISCV::PseudoSF_VSETTNT ||
+ MI.getOpcode() == RISCV::PseudoSF_VSETTNTX0);
+ switch (MI.getOpcode()) {
+ case RISCV::PseudoSF_VSETTNTX0:
+ NewInfo.setAVLVLMAX();
+ break;
+ case RISCV::PseudoSF_VSETTNT:
+ Register ATNReg = MI.getOperand(1).getReg();
+ NewInfo.setAVLRegDef(getVNInfoFromReg(ATNReg, MI, LIS), ATNReg);
+ break;
+ }
} else {
assert(MI.getOpcode() == RISCV::PseudoVSETVLI ||
MI.getOpcode() == RISCV::PseudoVSETVLIX0);
@@ -1005,11 +1062,34 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
RISCVVType::VLMUL VLMul = RISCVII::getLMul(TSFlags);
+ bool AltFmt = RISCVII::getAltFmtType(TSFlags) == RISCVII::AltFmtType::AltFmt;
+ InstrInfo.setAltFmt(AltFmt);
+
unsigned Log2SEW = MI.getOperand(getSEWOpNum(MI)).getImm();
// A Log2SEW of 0 is an operation on mask registers only.
unsigned SEW = Log2SEW ? 1 << Log2SEW : 8;
assert(RISCVVType::isValidSEW(SEW) && "Unexpected SEW");
+ if (RISCVII::hasTWidenOp(TSFlags)) {
+ const MachineOperand &TWidenOp =
+ MI.getOperand(MI.getNumExplicitOperands() - 1);
+ unsigned TWiden = TWidenOp.getImm();
+
+ InstrInfo.setAVLVLMAX();
+ if (RISCVII::hasVLOp(TSFlags)) {
+ const MachineOperand &TNOp =
+ MI.getOperand(RISCVII::getTNOpNum(MI.getDesc()));
+
+ if (TNOp.getReg().isVirtual())
+ InstrInfo.setAVLRegDef(getVNInfoFromReg(TNOp.getReg(), MI, LIS),
+ TNOp.getReg());
+ }
+
+ InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt, TWiden);
+
+ return InstrInfo;
+ }
+
if (RISCVII::hasVLOp(TSFlags)) {
const MachineOperand &VLOp = MI.getOperand(getVLOpNum(MI));
if (VLOp.isImm()) {
@@ -1045,7 +1125,9 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
assert(SEW == EEW && "Initial SEW doesn't match expected EEW");
}
#endif
- InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic);
+ // TODO: Propagate the twiden from previous vtype for potential reuse.
+ InstrInfo.setVTYPE(VLMul, SEW, TailAgnostic, MaskAgnostic, AltFmt,
+ /*TWiden*/ 0);
forwardVSETVLIAVL(InstrInfo);
@@ -1053,10 +1135,33 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const {
}
void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator InsertPt, DebugLoc DL,
- const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo) {
-
+ MachineBasicBlock::iterator InsertPt,
+ DebugLoc DL, const VSETVLIInfo &Info,
+ const VSETVLIInfo &PrevInfo) {
++NumInsertedVSETVL;
+
+ if (Info.getTWiden()) {
+ if (Info.hasAVLVLMAX()) {
+ Register DestReg = MRI->createVirtualRegister(&RISCV::GPRNoX0RegClass);
+ auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNTX0))
+ .addReg(DestReg, RegState::Define | RegState::Dead)
+ .addReg(RISCV::X0, RegState::Kill)
+ .addImm(Info.encodeVTYPE());
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*MI);
+ LIS->createAndComputeVirtRegInterval(DestReg);
+ }
+ } else {
+ auto MI = BuildMI(MBB, InsertPt, DL, TII->get(RISCV::PseudoSF_VSETTNT))
+ .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+ .addReg(Info.getAVLReg())
+ .addImm(Info.encodeVTYPE());
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*MI);
+ }
+ return;
+ }
+
if (PrevInfo.isValid() && !PrevInfo.isUnknown()) {
// Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same
// VLMAX.
@@ -1198,7 +1303,8 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
// be coalesced into another vsetvli since we won't demand any fields.
VSETVLIInfo NewInfo; // Need a new VSETVLIInfo to clear SEWLMULRatioOnly
NewInfo.setAVLImm(1);
- NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true);
+ NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true,
+ /*AltFmt*/ false, /*W*/ 0);
Info = NewInfo;
return;
}
@@ -1240,7 +1346,9 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info,
(Demanded.TailPolicy ? IncomingInfo : Info).getTailAgnostic() ||
IncomingInfo.getTailAgnostic(),
(Demanded.MaskPolicy ? IncomingInfo : Info).getMaskAgnostic() ||
- IncomingInfo.getMaskAgnostic());
+ IncomingInfo.getMaskAgnostic(),
+ (Demanded.AltFmt ? IncomingInfo : Info).getAltFmt(),
+ Demanded.TWiden ? IncomingInfo.getTWiden() : 0);
// If we only knew the sew/lmul ratio previously, replace the VTYPE but keep
// the AVL.
@@ -1293,7 +1401,8 @@ bool RISCVInsertVSETVLI::computeVLVTYPEChanges(const MachineBasicBlock &MBB,
if (RISCVInstrInfo::isVectorConfigInstr(MI) ||
RISCVII::hasSEWOp(MI.getDesc().TSFlags) ||
- isVectorCopy(ST->getRegisterInfo(), MI))
+ isVectorCopy(ST->getRegisterInfo(), MI) ||
+ RISCVInstrInfo::isXSfmmVectorConfigInstr(MI))
HadVectorOp = true;
transferAfter(Info, MI);
@@ -1675,6 +1784,12 @@ void RISCVInsertVSETVLI::coalesceVSETVLIs(MachineBasicBlock &MBB) const {
};
for (MachineInstr &MI : make_early_inc_range(reverse(MBB))) {
+ // TODO: Support XSfmm.
+ if (RISCVII::hasTWidenOp(MI.getDesc().TSFlags) ||
+ RISCVInstrInfo::isXSfmmVectorConfigInstr(MI)) {
+ NextMI = nullptr;
+ continue;
+ }
if (!RISCVInstrInfo::isVectorConfigInstr(MI)) {
Used.doUnion(getDemanded(MI, ST));
@@ -1788,6 +1903,65 @@ void RISCVInsertVSETVLI::insertReadVL(MachineBasicBlock &MBB) {
}
}
+// Inserts a PseudoSF_VSETTM or PseudoSF_VSETTK (selected by `Mode`) in front
+// of every instruction in `MBB` that carries SEW and TWiden operands together
+// with the matching TM/TK operand. The new pseudo takes over the register of
+// that operand, which is then cleared on the original instruction.
+// Returns true if at least one pseudo was inserted.
+bool RISCVInsertVSETVLI::insertVSETMTK(MachineBasicBlock &MBB,
+                                       TKTMMode Mode) const {
+
+  bool Changed = false;
+  for (auto &MI : MBB) {
+    uint64_t TSFlags = MI.getDesc().TSFlags;
+    if (RISCVInstrInfo::isXSfmmVectorConfigTMTKInstr(MI) ||
+        !RISCVII::hasSEWOp(TSFlags) || !RISCVII::hasTWidenOp(TSFlags))
+      continue;
+
+    // Filter on the requested operand kind first so that no VSETVLIInfo is
+    // computed for instructions that are going to be skipped anyway.
+    if (Mode == VSETTK && !RISCVII::hasTKOp(TSFlags))
+      continue;
+
+    if (Mode == VSETTM && !RISCVII::hasTMOp(TSFlags))
+      continue;
+
+    unsigned OpNum = 0;
+    unsigned Opcode = 0;
+    switch (Mode) {
+    case VSETTK:
+      OpNum = RISCVII::getTKOpNum(MI.getDesc());
+      Opcode = RISCV::PseudoSF_VSETTK;
+      break;
+    case VSETTM:
+      OpNum = RISCVII::getTMOpNum(MI.getDesc());
+      Opcode = RISCV::PseudoSF_VSETTM;
+      break;
+    }
+
+    assert(OpNum && Opcode && "Invalid OpNum or Opcode");
+
+    VSETVLIInfo CurrInfo = computeInfoForInstr(MI);
+    MachineOperand &Op = MI.getOperand(OpNum);
+    // Remember the register now; the operand is cleared below.
+    Register Reg = Op.getReg();
+
+    auto TmpMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opcode))
+                     .addReg(RISCV::X0, RegState::Define | RegState::Dead)
+                     .addReg(Reg)
+                     .addImm(Log2_32(CurrInfo.getSEW()))
+                     .addImm(Log2_32(CurrInfo.getTWiden()) + 1);
+
+    Changed = true;
+    // Erase the TM/TK register operand from the original instruction; the
+    // inserted pseudo is now its user.
+    Op.setReg(Register());
+    Op.setIsKill(false);
+    if (LIS) {
+      LIS->InsertMachineInstrInMaps(*TmpMI);
+      LiveInterval &LI = LIS->getInterval(Reg);
+
+      // The use moved from MI to the new pseudo, so recompute the interval
+      // from the remaining uses.
+      LIS->shrinkToUses(&LI);
+      // TODO: Enable this once needVSETVLIPHI is supported.
+      // SmallVector<LiveInterval *> SplitLIs;
+      // LIS->splitSeparateComponents(LI, SplitLIs);
+    }
+  }
+  return Changed;
+}
+
bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
// Skip if the vector extension is not enabled.
ST = &MF.getSubtarget<RISCVSubtarget>();
@@ -1865,6 +2039,11 @@ bool RISCVInsertVSETVLI::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF)
insertReadVL(MBB);
+ for (MachineBasicBlock &MBB : MF) {
+ insertVSETMTK(MBB, VSETTM);
+ insertVSETMTK(MBB, VSETTK);
+ }
+
BlockInfo.clear();
return HaveVectorOp;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 2afd77a..5b06303 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -267,6 +267,22 @@ class RVInstCommon<dag outs, dag ins, string opcodestr, string argstr,
// operands' VLs.
bit ReadsPastVL = 0;
let TSFlags{26} = ReadsPastVL;
+
+ // 0 -> Don't care about altfmt bit in VTYPE.
+ // 1 -> Is not altfmt.
+ // 2 -> Is altfmt(BF16).
+ bits<2> AltFmtType = 0;
+ let TSFlags{28-27} = AltFmtType;
+
+ // XSfmmbase
+ bit HasTWidenOp = 0;
+ let TSFlags{29} = HasTWidenOp;
+
+ bit HasTmOp = 0;
+ let TSFlags{30} = HasTmOp;
+
+ bit HasTkOp = 0;
+ let TSFlags{31} = HasTkOp;
}
class RVInst<dag outs, dag ins, string opcodestr, string argstr,
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 96e1078..ddb53a2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -3005,6 +3005,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
else
Ok = RISCVFPRndMode::isValidRoundingMode(Imm);
break;
+ case RISCVOp::OPERAND_XSFMM_VTYPE:
+ Ok = RISCVVType::isValidXSfmmVType(Imm);
+ break;
}
if (!Ok) {
ErrInfo = "Invalid immediate";
@@ -3670,6 +3673,11 @@ std::string RISCVInstrInfo::createMIROperandComment(
RISCVVType::printVType(Imm, OS);
break;
}
+ case RISCVOp::OPERAND_XSFMM_VTYPE: {
+ unsigned Imm = Op.getImm();
+ RISCVVType::printXSfmmVType(Imm, OS);
+ break;
+ }
case RISCVOp::OPERAND_SEW:
case RISCVOp::OPERAND_SEW_MASK: {
unsigned Log2SEW = Op.getImm();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 298d35a..c1b23af 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -128,6 +128,9 @@ defvar TAIL_AGNOSTIC = 1;
defvar TU_MU = 0;
defvar TA_MU = 1;
defvar TA_MA = 3;
+defvar DONT_CARE_ALTFMT = 0;
+defvar IS_NOT_ALTFMT = 1;
+defvar IS_ALTFMT = 2;
//===----------------------------------------------------------------------===//
// Utilities.
@@ -159,7 +162,8 @@ class PseudoToVInst<string PseudoInst> {
["_M4", ""],
["_M8", ""],
["_SE", ""],
- ["_RM", ""]
+ ["_RM", ""],
+ ["_ALT", ""]
];
string VInst = !foldl(PseudoInst, AffixSubsts, Acc, AffixSubst,
!subst(AffixSubst[0], AffixSubst[1], Acc));
@@ -6396,7 +6400,7 @@ let Defs = [VXSAT] in {
// 13. Vector Floating-Point Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
//===----------------------------------------------------------------------===//
// 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions
//===----------------------------------------------------------------------===//
@@ -6565,7 +6569,7 @@ defm PseudoVFNCVT_F_F : VPseudoVNCVTD_W_RM;
defm PseudoVFNCVT_ROD_F_F : VPseudoVNCVTD_W;
} // mayRaiseFPException = true
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 14. Vector Reduction Operations
@@ -6593,7 +6597,7 @@ defm PseudoVWREDSUM : VPseudoVWRED_VS;
}
} // Predicates = [HasVInstructions]
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
//===----------------------------------------------------------------------===//
// 14.3. Vector Single-Width Floating-Point Reduction Instructions
//===----------------------------------------------------------------------===//
@@ -6612,7 +6616,7 @@ defm PseudoVFWREDUSUM : VPseudoVFWRED_VS_RM;
defm PseudoVFWREDOSUM : VPseudoVFWREDO_VS_RM;
}
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 15. Vector Mask Instructions
@@ -6703,7 +6707,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
// 16.2. Floating-Point Scalar Move Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
foreach f = FPList in {
let HasSEWOp = 1, BaseInstr = VFMV_F_S in
@@ -6718,7 +6722,7 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
Sched<[WriteVMovSF, ReadVMovSF_V, ReadVMovSF_F]>;
}
}
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 16.3. Vector Slide Instructions
@@ -6730,10 +6734,10 @@ let Predicates = [HasVInstructions] in {
defm PseudoVSLIDE1DOWN : VPseudoVSLD1_VX;
} // Predicates = [HasVInstructions]
-let Predicates = [HasVInstructionsAnyF] in {
+let Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT in {
defm PseudoVFSLIDE1UP : VPseudoVSLD1_VF<"@earlyclobber $rd">;
defm PseudoVFSLIDE1DOWN : VPseudoVSLD1_VF;
-} // Predicates = [HasVInstructionsAnyF]
+} // Predicates = [HasVInstructionsAnyF], AltFmtType = IS_NOT_ALTFMT
//===----------------------------------------------------------------------===//
// 16.4. Vector Register Gather Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 557d873..6a4119a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -438,8 +438,10 @@ let Predicates = [HasVendorXSfvcp] in {
}
foreach f = FPList in {
foreach m = f.MxList in {
- defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>;
- defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>;
+ let AltFmtType = IS_NOT_ALTFMT in {
+ defm f.FX # "V" : VPseudoVC_XV<m, f.fprclass, payload1>;
+ defm f.FX # "VV" : VPseudoVC_XVV<m, f.fprclass, payload1>;
+ }
}
}
foreach m = MxListW in {
@@ -449,7 +451,8 @@ let Predicates = [HasVendorXSfvcp] in {
}
foreach f = FPListW in {
foreach m = f.MxList in
- defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>;
+ let AltFmtType = IS_NOT_ALTFMT in
+ defm f.FX # "VW" : VPseudoVC_XVW<m, f.fprclass, payload1>;
}
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
index a5ee701..5ad22e6b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSfmm.td
@@ -225,7 +225,7 @@ let Predicates = [HasVendorXSfmmbase] in {
def SF_VSETTM : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00001,
"sf.vsettm", "$rd, $rs1">;
def SF_VSETTK : SFInstSetSingle<(outs GPR:$rd), (ins GPR:$rs1), 0b00010,
- "sf.vsettk", "$rd, $rs1">;
+ "sf.vsettk", "$rd, $rs1">;
def SF_VTDISCARD : SFInstVtDiscard<"sf.vtdiscard">;
def SF_VTMV_V_T : SFInstTileMoveOp<0b010000, (outs VR:$vd), (ins GPR:$rs1),
@@ -277,3 +277,144 @@ let Uses = [FRM], mayRaiseFPException = true in {
} // Predicates = [HasVendorXSfmm32a8f]
} // DecoderNamespace = "XSfvector"
+
+// Pseudo base class for XSfmm tile loads (instantiated as PseudoSF_VLTE<eew>).
+// It has no outputs; inputs are $rs2, $rs1, the TN value ($atn), SEW and
+// TWiden.
+class VPseudoSF_VTileLoad
+ : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let HasVLOp = 1; // TN ($atn) is carried in the VL operand slot.
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+// Pseudo base class for XSfmm tile stores (instantiated as PseudoSF_VSTE<eew>).
+// Same operand list as VPseudoSF_VTileLoad, but marked mayStore instead of
+// mayLoad.
+class VPseudoSF_VTileStore
+ : RISCVVPseudo<(outs), (ins GPR:$rs2, GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 1;
+ let HasVLOp = 1; // TN ($atn) is carried in the VL operand slot.
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+// Pseudo base class for PseudoSF_VTMV_V_T: produces a VRM8 result ($vd) from
+// $rs1, with TN, SEW and TWiden operands. Neither loads nor stores.
+class VPseudoSF_VTileMove_V_T
+ : RISCVVPseudo<(outs VRM8:$vd), (ins GPR:$rs1, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasVLOp = 1; // TN ($atn) is carried in the VL operand slot.
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+// Pseudo base class for PseudoSF_VTMV_T_V: no outputs; takes $rs1 and a VRM8
+// source ($vs2) plus TN, SEW and TWiden operands. Neither loads nor stores.
+class VPseudoSF_VTileMove_T_V
+ : RISCVVPseudo<(outs), (ins GPR:$rs1, VRM8:$vs2, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasVLOp = 1; // TN ($atn) is carried in the VL operand slot.
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+// Pseudo base class for XSfmm matrix-multiply instructions without a rounding
+// mode operand. `mtd_class` is the register class of $rd. The trailing
+// operands are, in order: TM, TN, TK, SEW, TWiden.
+class VPseudoSF_MatMul<RegisterClass mtd_class>
+ : RISCVVPseudo<(outs),
+ (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, AVL:$atm, AVL:$atn,
+ AVL:$atk, ixlenimm:$sew, ixlenimm:$twiden)> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasTmOp = 1;
+ let HasVLOp = 1; // TN ($atn) is carried in the VL operand slot.
+ let HasTkOp = 1;
+ let HasSEWOp = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+}
+
+// Variant of the matrix-multiply pseudo that additionally carries a rounding
+// mode operand ($frm), placed immediately before TM. `mtd_class` is the
+// register class of $rd. BaseInstr is derived from the pseudo's own name via
+// PseudoToVInst.
+class VPseudoSF_MatMul_FRM<RegisterClass mtd_class>
+ : RISCVVPseudo<(outs),
+ (ins mtd_class:$rd, VRM8:$vs2, VRM8:$vs1, ixlenimm:$frm,
+ AVL:$atm, AVL:$atn, AVL:$atk, ixlenimm:$sew,
+ ixlenimm:$twiden), []> {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let HasTmOp = 1;
+ let HasVLOp = 1; // TN ($atn) is carried in the VL operand slot.
+ let HasTkOp = 1;
+ let HasSEWOp = 1;
+ let HasRoundModeOp = 1;
+ let hasPostISelHook = 1;
+ let HasTWidenOp = 1;
+ let hasSideEffects = 1;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+// TN-setting pseudos. All three define VL and VTYPE and expand to VSETVLI;
+// they differ only in the register classes allowed for $rd and $rs1:
+//   PseudoSF_VSETTNT     - $rd: GPR,     $rs1: GPRNoX0
+//   PseudoSF_VSETTNTX0   - $rd: GPRNoX0, $rs1: GPRX0
+//   PseudoSF_VSETTNTX0X0 - $rd: GPRX0,   $rs1: GPRX0
+let Defs = [VL, VTYPE] in {
+ def PseudoSF_VSETTNT
+ : Pseudo<(outs GPR:$rd),
+ (ins GPRNoX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+ PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+ def PseudoSF_VSETTNTX0
+ : Pseudo<(outs GPRNoX0:$rd),
+ (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+ PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+ def PseudoSF_VSETTNTX0X0
+ : Pseudo<(outs GPRX0:$rd),
+ (ins GPRX0:$rs1, XSfmmVTypeOp:$vtypei), []>,
+ PseudoInstExpansion<(VSETVLI GPR:$rd, GPR:$rs1, VTypeIOp11:$vtypei)>,
+ Sched<[WriteVSETVLI, ReadVSETVLI]>;
+}
+
+// TM/TK-setting pseudos. Both read and write VTYPE and carry a SEW and a
+// TWiden immediate; only $rd and $rs1 are forwarded when expanding to the
+// real SF_VSETTM / SF_VSETTK instruction.
+// The SEW operand of PseudoSF_VSETTK is named $log2sew to match
+// PseudoSF_VSETTM (it was previously misspelled $logwsew).
+let Defs = [VTYPE], Uses = [VTYPE], HasTWidenOp = 1, HasSEWOp = 1 in {
+  def PseudoSF_VSETTM
+      : Pseudo<(outs GPR:$rd),
+               (ins GPR:$rs1, ixlenimm:$log2sew, ixlenimm:$twiden), []>,
+        PseudoInstExpansion<(SF_VSETTM GPR:$rd, GPR:$rs1)>,
+        Sched<[WriteVSETVLI, ReadVSETVLI]>;
+  def PseudoSF_VSETTK
+      : Pseudo<(outs GPR:$rd),
+               (ins GPR:$rs1, ixlenimm:$log2sew, ixlenimm:$twiden), []>,
+        PseudoInstExpansion<(SF_VSETTK GPR:$rd, GPR:$rs1)>,
+        Sched<[WriteVSETVLI, ReadVSETVLI]>;
+}
+}
+
+foreach eew = [8, 16, 32, 64] in {
+ def PseudoSF_VLTE # eew : VPseudoSF_VTileLoad;
+ def PseudoSF_VSTE # eew : VPseudoSF_VTileStore;
+}
+
+def PseudoSF_VTMV_T_V : VPseudoSF_VTileMove_T_V;
+def PseudoSF_VTMV_V_T : VPseudoSF_VTileMove_V_T;
+
+foreach a = I8Encodes in
+ foreach b = I8Encodes in
+ def PseudoSF_MM_ # !toupper(a.Name) # _ # !toupper(b.Name)
+ : VPseudoSF_MatMul<TRM4>;
+
+let AltFmtType = IS_NOT_ALTFMT in
+ def PseudoSF_MM_F_F : VPseudoSF_MatMul_FRM<TRM2>;
+let AltFmtType = IS_ALTFMT in
+ def PseudoSF_MM_F_F_ALT : VPseudoSF_MatMul_FRM<TRM2>;
+
+foreach e1 = [5, 4] in
+ foreach e2 = [5, 4] in
+ def PseudoSF_MM_E # e1 # M # !sub(7, e1) # _E # e2 # M # !sub(7, e2)
+ : VPseudoSF_MatMul_FRM<TRM4>;
+
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
+ let HasVLOp = 1, HasTmOp = 1, HasTWidenOp = 1, HasSEWOp = 1 in
+ def PseudoSF_VTZERO_T
+ : RISCVVPseudo<(outs),
+ (ins TR:$rd, AVL:$atm, AVL:$atn, ixlenimm:$sew,
+ ixlenimm:$twiden)>;
+ def PseudoSF_VTDISCARD : RISCVVPseudo<(outs), (ins), []>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
index 3658817..dcae977 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrPredicates.td
@@ -78,7 +78,41 @@ def isVectorConfigInstr
PseudoVSETVLI,
PseudoVSETVLIX0,
PseudoVSETVLIX0X0,
- PseudoVSETIVLI
+ PseudoVSETIVLI,
+ PseudoSF_VSETTNT,
+ PseudoSF_VSETTNTX0,
+ PseudoSF_VSETTNTX0X0
+ ]>>>;
+
+// Returns true if this is one of the PseudoSF_VSETTNT* instructions.
+def isXSfmmVectorConfigTNInstr
+ : TIIPredicate<"isXSfmmVectorConfigTNInstr",
+ MCReturnStatement<
+ CheckOpcode<[
+ PseudoSF_VSETTNT,
+ PseudoSF_VSETTNTX0,
+ PseudoSF_VSETTNTX0X0
+ ]>>>;
+
+// Returns true if the opcode is PseudoSF_VSETTM or PseudoSF_VSETTK; the PseudoSF_VSETTNT* opcodes are not matched here.
+def isXSfmmVectorConfigTMTKInstr
+ : TIIPredicate<"isXSfmmVectorConfigTMTKInstr",
+ MCReturnStatement<
+ CheckOpcode<[
+ PseudoSF_VSETTM,
+ PseudoSF_VSETTK
+ ]>>>;
+
+// Returns true if this is any XSfmm vector configuration instruction (a PseudoSF_VSETTNT* variant, PseudoSF_VSETTM or PseudoSF_VSETTK).
+def isXSfmmVectorConfigInstr
+ : TIIPredicate<"isXSfmmVectorConfigInstr",
+ MCReturnStatement<
+ CheckOpcode<[
+ PseudoSF_VSETTNT,
+ PseudoSF_VSETTNTX0,
+ PseudoSF_VSETTNTX0X0,
+ PseudoSF_VSETTM,
+ PseudoSF_VSETTK
]>>>;
// Return true if this is 'vsetvli x0, x0, vtype' which preserves
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 40b6416..e9f43b9 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -178,6 +178,10 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Shadow stack pointer.
markSuperRegs(Reserved, RISCV::SSP);
+ // XSfmmbase
+ for (MCPhysReg Reg = RISCV::T0; Reg <= RISCV::T15; Reg++)
+ markSuperRegs(Reserved, Reg);
+
assert(checkAllSuperRegsMarked(Reserved));
return Reserved;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 6472334..47c24fc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -317,6 +317,15 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, T, Custom);
}
+ if (Subtarget->hasFP16()) {
+ setOperationAction(ISD::FMA, MVT::v8f16, Legal);
+ }
+
+ if (Subtarget->hasRelaxedSIMD()) {
+ setOperationAction(ISD::FMULADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::FMULADD, MVT::v2f64, Legal);
+ }
+
// Partial MLA reductions.
for (auto Op : {ISD::PARTIAL_REDUCE_SMLA, ISD::PARTIAL_REDUCE_UMLA}) {
setPartialReduceMLAAction(Op, MVT::v4i32, MVT::v16i8, Legal);
@@ -1120,6 +1129,18 @@ WebAssemblyTargetLowering::getPreferredVectorAction(MVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+/// Reports a fused multiply-add as faster than separate fmul + fadd only for
+/// vector types whose scalar type is the simple type f16, and only when the
+/// subtarget has FP16. Every other type yields false.
+bool WebAssemblyTargetLowering::isFMAFasterThanFMulAndFAdd(
+    const MachineFunction &MF, EVT VT) const {
+  // Both conditions are required; the vector check is skipped when FP16 is
+  // unavailable.
+  const bool Eligible = Subtarget->hasFP16() && VT.isVector();
+  if (!Eligible)
+    return false;
+
+  // Extended (non-simple) scalar types can never be f16.
+  const EVT EltVT = VT.getScalarType();
+  return EltVT.isSimple() && EltVT.getSimpleVT().SimpleTy == MVT::f16;
+}
+
bool WebAssemblyTargetLowering::shouldSimplifyDemandedVectorElts(
SDValue Op, const TargetLoweringOpt &TLO) const {
// ISel process runs DAGCombiner after legalization; this step is called
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index b33a853..472ec67 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -81,6 +81,8 @@ private:
TargetLoweringBase::LegalizeTypeAction
getPreferredVectorAction(MVT VT) const override;
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 49af78b..0f6e1ca 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1213,6 +1213,27 @@ defm EXTMUL_LOW_U :
defm EXTMUL_HIGH_U :
SIMDExtBinary<I64x2, extmul_high_u, "extmul_high_i32x4_u", 0xdf>;
+// Pattern for i32x4.dot_i16x8_s: the two shuffles pick complementary index sets from the extmul_low_s/extmul_high_s products, and their sum is selected as DOT.
+def : Pat<
+ (v4i32 (add
+ (wasm_shuffle
+ (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)),
+ (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)),
+ (i32 0), (i32 1), (i32 2), (i32 3), // Indices 0-3, 8-11, 16-19, 24-27: presumably the bytes of the even 32-bit lanes (TODO confirm shuffle index semantics).
+ (i32 8), (i32 9), (i32 10), (i32 11),
+ (i32 16), (i32 17), (i32 18), (i32 19),
+ (i32 24), (i32 25), (i32 26), (i32 27)),
+ (wasm_shuffle
+ (v4i32 (extmul_low_s v8i16:$lhs, v8i16:$rhs)),
+ (v4i32 (extmul_high_s v8i16:$lhs, v8i16:$rhs)),
+ (i32 4), (i32 5), (i32 6), (i32 7), // Indices 4-7, 12-15, 20-23, 28-31: the complementary (odd) lanes under the same assumption.
+ (i32 12), (i32 13), (i32 14), (i32 15),
+ (i32 20), (i32 21), (i32 22), (i32 23),
+ (i32 28), (i32 29), (i32 30), (i32 31)))
+ ),
+ (v4i32 (DOT v8i16:$lhs, v8i16:$rhs)) // Both shuffles must see the same $lhs/$rhs for the pattern to match.
+>;
+
//===----------------------------------------------------------------------===//
// Floating-point unary arithmetic
//===----------------------------------------------------------------------===//
@@ -1626,7 +1647,8 @@ defm "" : RelaxedConvert<I32x4, F64x2, int_wasm_relaxed_trunc_unsigned_zero,
// Relaxed (Negative) Multiply-Add (madd/nmadd)
//===----------------------------------------------------------------------===//
-multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate> reqs> {
+multiclass RELAXED_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS,
+ list<Predicate> reqs> {
defm MADD_#vec :
SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
[(set (vec.vt V128:$dst), (int_wasm_relaxed_madd
@@ -1640,16 +1662,46 @@ multiclass SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, list<Predicate>
vec.prefix#".relaxed_nmadd\t$dst, $a, $b, $c",
vec.prefix#".relaxed_nmadd", simdopS, reqs>;
- def : Pat<(fadd_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
- (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+ def : Pat<(fadd_contract (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b)), (vec.vt V128:$c)),
+ (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
+ def : Pat<(fmuladd (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)),
+ (!cast<Instruction>("MADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
- def : Pat<(fsub_contract (vec.vt V128:$a), (fmul_contract (vec.vt V128:$b), (vec.vt V128:$c))),
- (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<[HasRelaxedSIMD]>;
+ def : Pat<(fsub_contract (vec.vt V128:$c), (fmul_contract (vec.vt V128:$a), (vec.vt V128:$b))),
+ (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
+ def : Pat<(fmuladd (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)),
+ (!cast<Instruction>("NMADD_"#vec) V128:$a, V128:$b, V128:$c)>, Requires<reqs>;
}
-defm "" : SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
-defm "" : SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>;
-defm "" : SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>;
+defm "" : RELAXED_SIMDMADD<F32x4, 0x105, 0x106, [HasRelaxedSIMD]>;
+defm "" : RELAXED_SIMDMADD<F64x2, 0x107, 0x108, [HasRelaxedSIMD]>;
+
+//===----------------------------------------------------------------------===//
+// FP16 (Negative) Multiply-Add (madd/nmadd)
+//===----------------------------------------------------------------------===//
+
+multiclass HALF_PRECISION_SIMDMADD<Vec vec, bits<32> simdopA, bits<32> simdopS, // simdopA is passed to the madd instruction, simdopS to nmadd.
+ list<Predicate> reqs> {
+ defm MADD_#vec :
+ SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst), (fma // Selected from the fma node on (a, b, c).
+ (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".madd\t$dst, $a, $b, $c",
+ vec.prefix#".madd", simdopA, reqs>;
+ defm NMADD_#vec :
+ SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), (outs), (ins),
+ [(set (vec.vt V128:$dst), (fma // Same fma node, but with the first operand negated.
+ (fneg (vec.vt V128:$a)), (vec.vt V128:$b), (vec.vt V128:$c)))],
+ vec.prefix#".nmadd\t$dst, $a, $b, $c",
+ vec.prefix#".nmadd", simdopS, reqs>;
+}
+defm "" : HALF_PRECISION_SIMDMADD<F16x8, 0x14e, 0x14f, [HasFP16]>; // Defines MADD_F16x8 and NMADD_F16x8.
+
+// TODO: Consider introducing dedicated intrinsics for these FP16 operations; for now the relaxed madd/nmadd intrinsics on v8f16 are mapped onto the F16x8 instructions.
+def : Pat<(v8f16 (int_wasm_relaxed_madd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))),
+ (MADD_F16x8 V128:$a, V128:$b, V128:$c)>;
+def : Pat<(v8f16 (int_wasm_relaxed_nmadd (v8f16 V128:$a), (v8f16 V128:$b), (v8f16 V128:$c))),
+ (NMADD_F16x8 V128:$a, V128:$b, V128:$c)>;
//===----------------------------------------------------------------------===//
// Laneselect
diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp
index acf8e4c..5ea63a9 100644
--- a/llvm/lib/TargetParser/RISCVTargetParser.cpp
+++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp
@@ -228,6 +228,10 @@ void printVType(unsigned VType, raw_ostream &OS) {
OS << ", mu";
}
+/// Writes \p VType to \p OS in the form "e<SEW>, w<widen>", where the two
+/// numbers come from getSEW() and getXSfmmWiden() respectively.
+void printXSfmmVType(unsigned VType, raw_ostream &OS) {
+  OS << "e" << getSEW(VType);
+  OS << ", w" << getXSfmmWiden(VType);
+}
+
unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul) {
unsigned LMul;
bool Fractional;
diff --git a/llvm/lib/Transforms/Coroutines/CoroCloner.h b/llvm/lib/Transforms/Coroutines/CoroCloner.h
index 26ec4f3..e05fe28 100644
--- a/llvm/lib/Transforms/Coroutines/CoroCloner.h
+++ b/llvm/lib/Transforms/Coroutines/CoroCloner.h
@@ -1,3 +1,4 @@
+//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -19,9 +20,7 @@
#include "llvm/Transforms/Coroutines/CoroInstr.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
-namespace llvm {
-
-namespace coro {
+namespace llvm::coro {
enum class CloneKind {
/// The shared resume function for a switch lowering.
@@ -149,8 +148,6 @@ public:
}
};
-} // end namespace coro
-
-} // end namespace llvm
+} // end namespace llvm::coro
#endif // LLVM_LIB_TRANSFORMS_COROUTINES_COROCLONER_H
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index 471b9eb..cdb5852 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -38,7 +38,7 @@ public:
AnyResumeFnPtrTy(PointerType::getUnqual(Context)) {}
void lowerEarlyIntrinsics(Function &F);
};
-}
+} // namespace
// Replace a direct call to coro.resume or coro.destroy with an indirect call to
// an address returned by coro.subfn.addr intrinsic. This is done so that
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 52f4ffe..cc47a55 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -16,11 +16,7 @@
#include "llvm/Transforms/Coroutines/CoroInstr.h"
#include "llvm/Transforms/Coroutines/CoroShape.h"
-namespace llvm {
-
-class CallGraph;
-
-namespace coro {
+namespace llvm::coro {
bool isSuspendBlock(BasicBlock *BB);
bool declaresAnyIntrinsic(const Module &M);
@@ -61,7 +57,6 @@ void normalizeCoroutine(Function &F, coro::Shape &Shape,
CallInst *createMustTailCall(DebugLoc Loc, Function *MustTailCallFn,
TargetTransformInfo &TTI,
ArrayRef<Value *> Arguments, IRBuilder<> &);
-} // End namespace coro.
-} // End namespace llvm
+} // End namespace llvm::coro
#endif
diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
index 6aaabca..f2444da 100644
--- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
@@ -137,8 +137,7 @@ struct RematGraph {
} // namespace
-namespace llvm {
-template <> struct GraphTraits<RematGraph *> {
+template <> struct llvm::GraphTraits<RematGraph *> {
using NodeRef = RematGraph::RematNode *;
using ChildIteratorType = RematGraph::RematNode **;
@@ -149,8 +148,6 @@ template <> struct GraphTraits<RematGraph *> {
static ChildIteratorType child_end(NodeRef N) { return N->Operands.end(); }
};
-} // end namespace llvm
-
// For each instruction identified as materializable across the suspend point,
// and its associated DAG of other rematerializable instructions,
// recreate the DAG of instructions after the suspend point.
diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
index e474c07..81fe0c9 100644
--- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp
@@ -16,11 +16,8 @@
#include "llvm/IR/InstIterator.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-namespace llvm {
-
-namespace coro {
-
-namespace {
+using namespace llvm;
+using namespace llvm::coro;
typedef SmallPtrSet<BasicBlock *, 8> VisitedBlocksSet;
@@ -71,7 +68,7 @@ static bool isLocalAlloca(CoroAllocaAllocInst *AI) {
/// This happens during the all-instructions iteration, so it must not
/// delete the call.
static Instruction *
-lowerNonLocalAlloca(CoroAllocaAllocInst *AI, const coro::Shape &Shape,
+lowerNonLocalAlloca(CoroAllocaAllocInst *AI, const Shape &Shape,
SmallVectorImpl<Instruction *> &DeadInsts) {
IRBuilder<> Builder(AI);
auto Alloc = Shape.emitAlloc(Builder, AI->getSize(), nullptr);
@@ -450,10 +447,8 @@ static void collectFrameAlloca(AllocaInst *AI, const coro::Shape &Shape,
Visitor.getMayWriteBeforeCoroBegin());
}
-} // namespace
-
-void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
- const SuspendCrossingInfo &Checker) {
+void coro::collectSpillsFromArgs(SpillInfo &Spills, Function &F,
+ const SuspendCrossingInfo &Checker) {
// Collect the spills for arguments and other not-materializable values.
for (Argument &A : F.args())
for (User *U : A.users())
@@ -461,7 +456,7 @@ void collectSpillsFromArgs(SpillInfo &Spills, Function &F,
Spills[&A].push_back(cast<Instruction>(U));
}
-void collectSpillsAndAllocasFromInsts(
+void coro::collectSpillsAndAllocasFromInsts(
SpillInfo &Spills, SmallVector<AllocaInfo, 8> &Allocas,
SmallVector<Instruction *, 4> &DeadInstructions,
SmallVector<CoroAllocaAllocInst *, 4> &LocalAllocas, Function &F,
@@ -516,8 +511,8 @@ void collectSpillsAndAllocasFromInsts(
}
}
-void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
- const SuspendCrossingInfo &Checker) {
+void coro::collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
+ const SuspendCrossingInfo &Checker) {
// We don't want the layout of coroutine frame to be affected
// by debug information. So we only choose to salvage dbg.values for
// whose value is already in the frame.
@@ -535,10 +530,9 @@ void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F,
/// Async and Retcon{Once} conventions assume that all spill uses can be sunk
/// after the coro.begin intrinsic.
-void sinkSpillUsesAfterCoroBegin(const DominatorTree &Dom,
- CoroBeginInst *CoroBegin,
- coro::SpillInfo &Spills,
- SmallVectorImpl<coro::AllocaInfo> &Allocas) {
+void coro::sinkSpillUsesAfterCoroBegin(
+ const DominatorTree &Dom, CoroBeginInst *CoroBegin, coro::SpillInfo &Spills,
+ SmallVectorImpl<coro::AllocaInfo> &Allocas) {
SmallSetVector<Instruction *, 32> ToMove;
SmallVector<Instruction *, 32> Worklist;
@@ -582,8 +576,9 @@ void sinkSpillUsesAfterCoroBegin(const DominatorTree &Dom,
Inst->moveBefore(InsertPt->getIterator());
}
-BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def,
- const DominatorTree &DT) {
+BasicBlock::iterator coro::getSpillInsertionPt(const coro::Shape &Shape,
+ Value *Def,
+ const DominatorTree &DT) {
BasicBlock::iterator InsertPt;
if (auto *Arg = dyn_cast<Argument>(Def)) {
// For arguments, we will place the store instruction right after
@@ -625,7 +620,3 @@ BasicBlock::iterator getSpillInsertionPt(const coro::Shape &Shape, Value *Def,
return InsertPt;
}
-
-} // End namespace coro.
-
-} // End namespace llvm.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 7071876..943c223 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -471,7 +471,6 @@ private:
Value *simplifyNonNullOperand(Value *V, bool HasDereferenceable,
unsigned Depth = 0);
-public:
/// Create `select C, S1, S2`. Use only when the profile cannot be calculated
/// from existing profile metadata: if the Function has profiles, this will
/// set the profile of this select to "unknown".
@@ -484,6 +483,7 @@ public:
return Sel;
}
+public:
/// Create and insert the idiom we use to indicate a block is unreachable
/// without having to rewrite the CFG from within InstCombine.
void CreateNonTerminatorUnreachable(Instruction *InsertAt) {
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 63e24a0..a330bb7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -110,8 +110,8 @@ static Value *simplifyShiftSelectingPackedElement(Instruction *I,
ShrAmt->getName() + ".z");
// There is no existing !prof metadata we can derive the !prof metadata for
// this select.
- Value *Select = IC.createSelectInstWithUnknownProfile(ShrAmtZ, Lower, Upper);
- IC.Builder.Insert(Select);
+ Value *Select = IC.Builder.CreateSelectWithUnknownProfile(ShrAmtZ, Lower,
+ Upper, DEBUG_TYPE);
Select->takeName(I);
return Select;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 82ac903..3f11cae 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1690,6 +1690,11 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCastsFromSign(
// 2) (fp_binop ({s|u}itofp x), FpC)
// -> ({s|u}itofp (int_binop x, (fpto{s|u}i FpC)))
Instruction *InstCombinerImpl::foldFBinOpOfIntCasts(BinaryOperator &BO) {
+ // Don't perform the fold on vectors, as the integer operation may be much
+ // more expensive than the float operation in that case.
+ if (BO.getType()->isVectorTy())
+ return nullptr;
+
std::array<Value *, 2> IntOps = {nullptr, nullptr};
Constant *Op1FpC = nullptr;
// Check for:
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
index c86092b..a6ec6c1 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfUse.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/StaticDataProfileInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
@@ -194,6 +195,30 @@ static bool isAllocationWithHotColdVariant(const Function *Callee,
}
}
+/// Records why \p GVar is not annotated. \p Kind must be one of the
+/// unsupported kinds (never AnnotationOK). The explicit-section case also
+/// bumps its statistic; every case emits a debug message naming the reason.
+static void HandleUnsupportedAnnotationKinds(GlobalVariable &GVar,
+                                             AnnotationKind Kind) {
+  assert(Kind != llvm::memprof::AnnotationKind::AnnotationOK &&
+         "Should not handle AnnotationOK here");
+
+  // Human-readable reason, consumed only by the debug output below.
+  SmallString<32> Reason;
+  if (Kind == llvm::memprof::AnnotationKind::ExplicitSection) {
+    ++NumOfMemProfExplicitSectionGlobalVars;
+    Reason += "explicit section name";
+  } else if (Kind == llvm::memprof::AnnotationKind::DeclForLinker) {
+    Reason += "linker declaration";
+  } else if (Kind == llvm::memprof::AnnotationKind::ReservedName) {
+    Reason += "name starts with `llvm.`";
+  } else {
+    llvm_unreachable("Unexpected annotation kind");
+  }
+
+  LLVM_DEBUG(dbgs() << "Skip annotation for " << GVar.getName() << " due to "
+                    << Reason << ".\n");
+}
+
struct AllocMatchInfo {
uint64_t TotalSize = 0;
AllocationType AllocType = AllocationType::None;
@@ -775,29 +800,13 @@ PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
return PreservedAnalyses::none();
}
-// Returns true iff the global variable has custom section either by
-// __attribute__((section("name")))
-// (https://clang.llvm.org/docs/AttributeReference.html#section-declspec-allocate)
-// or #pragma clang section directives
-// (https://clang.llvm.org/docs/LanguageExtensions.html#specifying-section-names-for-global-objects-pragma-clang-section).
-static bool hasExplicitSectionName(const GlobalVariable &GVar) {
- if (GVar.hasSection())
- return true;
-
- auto Attrs = GVar.getAttributes();
- if (Attrs.hasAttribute("bss-section") || Attrs.hasAttribute("data-section") ||
- Attrs.hasAttribute("relro-section") ||
- Attrs.hasAttribute("rodata-section"))
- return true;
- return false;
-}
-
bool MemProfUsePass::annotateGlobalVariables(
Module &M, const memprof::DataAccessProfData *DataAccessProf) {
if (!AnnotateStaticDataSectionPrefix || M.globals().empty())
return false;
if (!DataAccessProf) {
+ M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 0U);
M.getContext().diagnose(DiagnosticInfoPGOProfile(
MemoryProfileFileName.data(),
StringRef("Data access profiles not found in memprof. Ignore "
@@ -805,6 +814,7 @@ bool MemProfUsePass::annotateGlobalVariables(
DS_Warning));
return false;
}
+ M.addModuleFlag(Module::Warning, "EnableDataAccessProf", 1U);
bool Changed = false;
// Iterate all global variables in the module and annotate them based on
@@ -815,13 +825,9 @@ bool MemProfUsePass::annotateGlobalVariables(
for (GlobalVariable &GVar : M.globals()) {
assert(!GVar.getSectionPrefix().has_value() &&
"GVar shouldn't have section prefix yet");
- if (GVar.isDeclarationForLinker())
- continue;
-
- if (hasExplicitSectionName(GVar)) {
- ++NumOfMemProfExplicitSectionGlobalVars;
- LLVM_DEBUG(dbgs() << "Global variable " << GVar.getName()
- << " has explicit section name. Skip annotating.\n");
+ auto Kind = llvm::memprof::getAnnotationKind(GVar);
+ if (Kind != llvm::memprof::AnnotationKind::AnnotationOK) {
+ HandleUnsupportedAnnotationKinds(GVar, Kind);
continue;
}
@@ -831,7 +837,6 @@ bool MemProfUsePass::annotateGlobalVariables(
// TODO: Track string content hash in the profiles and compute it inside the
// compiler to categeorize the hotness string literals.
if (Name.starts_with(".str")) {
-
LLVM_DEBUG(dbgs() << "Skip annotating string literal " << Name << "\n");
continue;
}
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 56e0569..7cae94eb 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1295,6 +1295,24 @@ public:
return commonAlignment(InitialAlign, ElementSizeInBits / 8);
}
+  /// Returns the index type the data layout assigns to the type of \p Ptr,
+  /// cast to IntegerType.
+  IntegerType *getIndexType(Value *Ptr) const {
+    Type *IdxTy = DL.getIndexType(Ptr->getType());
+    return cast<IntegerType>(IdxTy);
+  }
+
+  /// Returns the constant \p V expressed in the index type of \p Ptr.
+  Value *getIndex(Value *Ptr, uint64_t V) const {
+    IntegerType *IdxTy = getIndexType(Ptr);
+    return ConstantInt::get(IdxTy, V);
+  }
+
+  /// Converts the integer value \p V to the index type of \p Ptr, zero
+  /// extending or truncating as needed. The result is named "<V>.cast".
+  Value *castToIndexType(Value *Ptr, Value *V, IRBuilder<> &Builder) const {
+    assert(V->getType()->isIntegerTy() &&
+           "Attempted to cast non-integral type to integer index");
+    // The data layout's index width may differ from V's width. Indices are
+    // treated as unsigned here, hence zero extension rather than sign
+    // extension.
+    IntegerType *IdxTy = getIndexType(Ptr);
+    return Builder.CreateZExtOrTrunc(V, IdxTy, V->getName() + ".cast");
+  }
+
/// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between
/// vectors.
MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
@@ -1304,6 +1322,7 @@ public:
Type *VecTy = FixedVectorType::get(EltTy, Shape.getStride());
Value *EltPtr = Ptr;
MatrixTy Result;
+ Stride = castToIndexType(Ptr, Stride, Builder);
for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
Value *GEP = computeVectorAddr(
EltPtr, Builder.getIntN(Stride->getType()->getScalarSizeInBits(), I),
@@ -1325,14 +1344,14 @@ public:
ShapeInfo ResultShape, Type *EltTy,
IRBuilder<> &Builder) {
Value *Offset = Builder.CreateAdd(
- Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+ Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I);
Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);
auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows *
ResultShape.NumColumns);
return loadMatrix(TileTy, TileStart, Align,
- Builder.getInt64(MatrixShape.getStride()), IsVolatile,
+ getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile,
ResultShape, Builder);
}
@@ -1363,14 +1382,15 @@ public:
MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,
Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {
Value *Offset = Builder.CreateAdd(
- Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+ Builder.CreateMul(J, getIndex(MatrixPtr, MatrixShape.getStride())), I);
Value *TileStart = Builder.CreateGEP(EltTy, MatrixPtr, Offset);
auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *
StoreVal.getNumColumns());
storeMatrix(TileTy, StoreVal, TileStart, MAlign,
- Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);
+ getIndex(MatrixPtr, MatrixShape.getStride()), IsVolatile,
+ Builder);
}
/// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
@@ -1380,6 +1400,7 @@ public:
IRBuilder<> &Builder) {
auto *VType = cast<FixedVectorType>(Ty);
Value *EltPtr = Ptr;
+ Stride = castToIndexType(Ptr, Stride, Builder);
for (auto Vec : enumerate(StoreVal.vectors())) {
Value *GEP = computeVectorAddr(
EltPtr,
@@ -2011,18 +2032,17 @@ public:
const unsigned TileM = std::min(M - K, unsigned(TileSize));
MatrixTy A =
loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(),
- LShape, Builder.getInt64(I), Builder.getInt64(K),
+ LShape, getIndex(APtr, I), getIndex(APtr, K),
{TileR, TileM}, EltType, Builder);
MatrixTy B =
loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
- RShape, Builder.getInt64(K), Builder.getInt64(J),
+ RShape, getIndex(BPtr, K), getIndex(BPtr, J),
{TileM, TileC}, EltType, Builder);
emitMatrixMultiply(Res, A, B, Builder, true, false,
getFastMathFlags(MatMul));
}
storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M},
- Builder.getInt64(I), Builder.getInt64(J), EltType,
- Builder);
+ getIndex(CPtr, I), getIndex(CPtr, J), EltType, Builder);
}
}
@@ -2254,15 +2274,14 @@ public:
/// Lower load instructions.
MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr,
IRBuilder<> &Builder) {
- return LowerLoad(Inst, Ptr, Inst->getAlign(),
- Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI,
- Builder);
+ return LowerLoad(Inst, Ptr, Inst->getAlign(), getIndex(Ptr, SI.getStride()),
+ Inst->isVolatile(), SI, Builder);
}
MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
Value *Ptr, IRBuilder<> &Builder) {
return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
- Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI,
+ getIndex(Ptr, SI.getStride()), Inst->isVolatile(), SI,
Builder);
}
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index b187208..32924e7 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -44,7 +44,7 @@ using namespace llvm;
STATISTIC(RemappedAtomMax, "Highest global NextAtomGroup (after mapping)");
void llvm::mapAtomInstance(const DebugLoc &DL, ValueToValueMapTy &VMap) {
- auto CurGroup = DL->getAtomGroup();
+ uint64_t CurGroup = DL->getAtomGroup();
if (!CurGroup)
return;
@@ -62,21 +62,20 @@ void llvm::mapAtomInstance(const DebugLoc &DL, ValueToValueMapTy &VMap) {
RemappedAtomMax = std::max<uint64_t>(NewGroup, RemappedAtomMax);
}
-namespace {
-void collectDebugInfoFromInstructions(const Function &F,
- DebugInfoFinder &DIFinder) {
+static void collectDebugInfoFromInstructions(const Function &F,
+ DebugInfoFinder &DIFinder) {
const Module *M = F.getParent();
- if (M) {
- // Inspect instructions to process e.g. DILexicalBlocks of inlined functions
- for (const auto &I : instructions(F))
- DIFinder.processInstruction(*M, I);
- }
+ if (!M)
+ return;
+ // Inspect instructions to process e.g. DILexicalBlocks of inlined functions
+ for (const Instruction &I : instructions(F))
+ DIFinder.processInstruction(*M, I);
}
// Create a predicate that matches the metadata that should be identity mapped
// during function cloning.
-MetadataPredicate createIdentityMDPredicate(const Function &F,
- CloneFunctionChangeType Changes) {
+static MetadataPredicate
+createIdentityMDPredicate(const Function &F, CloneFunctionChangeType Changes) {
if (Changes >= CloneFunctionChangeType::DifferentModule)
return [](const Metadata *MD) { return false; };
@@ -107,7 +106,6 @@ MetadataPredicate createIdentityMDPredicate(const Function &F,
return false;
};
}
-} // namespace
/// See comments in Cloning.h.
BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
@@ -213,10 +211,9 @@ void llvm::CloneFunctionMetadataInto(Function &NewFunc, const Function &OldFunc,
const MetadataPredicate *IdentityMD) {
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
OldFunc.getAllMetadata(MDs);
- for (auto MD : MDs) {
- NewFunc.addMetadata(MD.first,
- *MapMetadata(MD.second, VMap, RemapFlag, TypeMapper,
- Materializer, IdentityMD));
+ for (const auto &[Kind, MD] : MDs) {
+ NewFunc.addMetadata(Kind, *MapMetadata(MD, VMap, RemapFlag, TypeMapper,
+ Materializer, IdentityMD));
}
}
@@ -235,7 +232,6 @@ void llvm::CloneFunctionBodyInto(Function &NewFunc, const Function &OldFunc,
// appropriate. Note that we save BE this way in order to handle cloning of
// recursive functions into themselves.
for (const BasicBlock &BB : OldFunc) {
-
// Create a new basic block and copy instructions into it!
BasicBlock *CBB =
CloneBasicBlock(&BB, VMap, NameSuffix, &NewFunc, CodeInfo);
@@ -321,7 +317,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
// Cloning is always a Module level operation, since Metadata needs to be
// cloned.
- const auto RemapFlag = RF_None;
+ const RemapFlags RemapFlag = RF_None;
CloneFunctionMetadataInto(*NewFunc, *OldFunc, VMap, RemapFlag, TypeMapper,
Materializer, &IdentityMD);
@@ -346,8 +342,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
// visiting the metadata attached to global values, which would allow this
// code to be deleted. Alternatively, perhaps give responsibility for this
// update to CloneFunctionInto's callers.
- auto *NewModule = NewFunc->getParent();
- auto *NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu");
+ Module *NewModule = NewFunc->getParent();
+ NamedMDNode *NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu");
// Avoid multiple insertions of the same DICompileUnit to NMD.
SmallPtrSet<const void *, 8> Visited(llvm::from_range, NMD->operands());
@@ -355,7 +351,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
// the function (e.g. as instructions' scope).
DebugInfoFinder DIFinder;
collectDebugInfoFromInstructions(*OldFunc, DIFinder);
- for (auto *Unit : DIFinder.compile_units()) {
+ for (DICompileUnit *Unit : DIFinder.compile_units()) {
MDNode *MappedUnit =
MapMetadata(Unit, VMap, RF_None, TypeMapper, Materializer);
if (Visited.insert(MappedUnit).second)
@@ -821,17 +817,16 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
--PredCount[Pred];
// Figure out how many entries to remove from each PHI.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- ++PredCount[PN->getIncomingBlock(i)];
+ for (BasicBlock *Pred : PN->blocks())
+ ++PredCount[Pred];
// At this point, the excess predecessor entries are positive in the
// map. Loop over all of the PHIs and remove excess predecessor
// entries.
BasicBlock::iterator I = NewBB->begin();
for (; (PN = dyn_cast<PHINode>(I)); ++I) {
- for (const auto &PCI : PredCount) {
- BasicBlock *Pred = PCI.first;
- for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove)
+ for (const auto &[Pred, Count] : PredCount) {
+ for ([[maybe_unused]] unsigned _ : llvm::seq<unsigned>(Count))
PN->removeIncomingValue(Pred, false);
}
}
@@ -866,8 +861,8 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// As phi-nodes have been now remapped, allow incremental simplification of
// newly-cloned instructions.
const DataLayout &DL = NewFunc->getDataLayout();
- for (const auto &BB : *OldFunc) {
- for (const auto &I : BB) {
+ for (const BasicBlock &BB : *OldFunc) {
+ for (const Instruction &I : BB) {
auto *NewI = dyn_cast_or_null<Instruction>(VMap.lookup(&I));
if (!NewI)
continue;
@@ -997,8 +992,8 @@ void llvm::CloneAndPruneFunctionInto(
void llvm::remapInstructionsInBlocks(ArrayRef<BasicBlock *> Blocks,
ValueToValueMapTy &VMap) {
// Rewrite the code to refer to itself.
- for (auto *BB : Blocks) {
- for (auto &Inst : *BB) {
+ for (BasicBlock *BB : Blocks) {
+ for (Instruction &Inst : *BB) {
RemapDbgRecordRange(Inst.getModule(), Inst.getDbgRecordRange(), VMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
RemapInstruction(&Inst, VMap,
@@ -1151,9 +1146,9 @@ void llvm::cloneNoAliasScopes(ArrayRef<MDNode *> NoAliasDeclScopes,
StringRef Ext, LLVMContext &Context) {
MDBuilder MDB(Context);
- for (auto *ScopeList : NoAliasDeclScopes) {
- for (const auto &MDOperand : ScopeList->operands()) {
- if (MDNode *MD = dyn_cast<MDNode>(MDOperand)) {
+ for (MDNode *ScopeList : NoAliasDeclScopes) {
+ for (const MDOperand &MDOp : ScopeList->operands()) {
+ if (MDNode *MD = dyn_cast<MDNode>(MDOp)) {
AliasScopeNode SNANode(MD);
std::string Name;
@@ -1177,7 +1172,7 @@ void llvm::adaptNoAliasScopes(Instruction *I,
auto CloneScopeList = [&](const MDNode *ScopeList) -> MDNode * {
bool NeedsReplacement = false;
SmallVector<Metadata *, 8> NewScopeList;
- for (const auto &MDOp : ScopeList->operands()) {
+ for (const MDOperand &MDOp : ScopeList->operands()) {
if (MDNode *MD = dyn_cast<MDNode>(MDOp)) {
if (auto *NewMD = ClonedScopes.lookup(MD)) {
NewScopeList.push_back(NewMD);
@@ -1193,12 +1188,12 @@ void llvm::adaptNoAliasScopes(Instruction *I,
};
if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(I))
- if (auto *NewScopeList = CloneScopeList(Decl->getScopeList()))
+ if (MDNode *NewScopeList = CloneScopeList(Decl->getScopeList()))
Decl->setScopeList(NewScopeList);
auto replaceWhenNeeded = [&](unsigned MD_ID) {
if (const MDNode *CSNoAlias = I->getMetadata(MD_ID))
- if (auto *NewScopeList = CloneScopeList(CSNoAlias))
+ if (MDNode *NewScopeList = CloneScopeList(CSNoAlias))
I->setMetadata(MD_ID, NewScopeList);
};
replaceWhenNeeded(LLVMContext::MD_noalias);
diff --git a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
index d7bf791..fb39fdd 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdaterBulk.cpp
@@ -11,11 +11,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/IteratedDominanceFrontier.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
@@ -112,7 +112,7 @@ struct BBValueInfo {
void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
SmallVectorImpl<PHINode *> *InsertedPHIs) {
DenseMap<BasicBlock *, BBValueInfo> BBInfos;
- for (auto &R : Rewrites) {
+ for (RewriteInfo &R : Rewrites) {
BBInfos.clear();
// Compute locations for new phi-nodes.
@@ -145,7 +145,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
BBInfos[BB].LiveOutValue = V;
// We've computed IDF, now insert new phi-nodes there.
- for (auto *FrontierBB : IDFBlocks) {
+ for (BasicBlock *FrontierBB : IDFBlocks) {
IRBuilder<> B(FrontierBB, FrontierBB->begin());
PHINode *PN = B.CreatePHI(R.Ty, 0, R.Name);
BBInfos[FrontierBB].LiveInValue = PN;
@@ -156,7 +156,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
// IsLiveOut indicates whether we are computing live-out values (true) or
// live-in values (false).
auto ComputeValue = [&](BasicBlock *BB, bool IsLiveOut) -> Value * {
- auto *BBInfo = &BBInfos[BB];
+ BBValueInfo *BBInfo = &BBInfos[BB];
if (IsLiveOut && BBInfo->LiveOutValue)
return BBInfo->LiveOutValue;
@@ -187,7 +187,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
if (!V)
V = UndefValue::get(R.Ty);
- for (auto *BBInfo : Stack)
+ for (BBValueInfo *BBInfo : Stack)
// Loop above can insert new entries into the BBInfos map: assume the
// map shouldn't grow due to [1] and BBInfo references are valid.
BBInfo->LiveInValue = V;
@@ -196,7 +196,7 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
};
// Fill in arguments of the inserted PHIs.
- for (auto *BB : IDFBlocks) {
+ for (BasicBlock *BB : IDFBlocks) {
auto *PHI = cast<PHINode>(&BB->front());
for (BasicBlock *Pred : PredCache.get(BB))
PHI->addIncoming(ComputeValue(Pred, /*IsLiveOut=*/true), Pred);
@@ -222,3 +222,96 @@ void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
}
}
}
+
+// Perform a single pass of simplification over the worklist of PHIs; each PHI
+// that simplifies is RAUW'd, erased, and its Worklist slot is set to nullptr.
+// This should be called after RewriteAllUses(): simplifying PHIs right after
+// creation would require updating all references to those PHIs in the
+// BBValueInfo structures, which would need additional reference tracking.
+static void simplifyPass(MutableArrayRef<PHINode *> Worklist,
+ const DataLayout &DL) {
+ for (PHINode *&PHI : Worklist) { // By reference: the slot is overwritten below.
+ if (Value *Simplified = simplifyInstruction(PHI, DL)) {
+ PHI->replaceAllUsesWith(Simplified);
+ PHI->eraseFromParent();
+ PHI = nullptr; // Mark as removed; deduplicatePass() skips null slots.
+ }
+ }
+}
+
+#ifndef NDEBUG // Should this be under EXPENSIVE_CHECKS?
+// Returns true if a new PHI has a different new PHI as an incoming value.
+// New PHIs are expected to reference only themselves or existing PHIs
+// (existing PHIs may reference new ones); used to assert that invariant.
+static bool
+PHIAreRefEachOther(const iterator_range<BasicBlock::phi_iterator> NewPHIs) {
+ SmallPtrSet<PHINode *, 8> NewPHISet; // All PHIs in NewPHIs, for membership tests.
+ for (PHINode &PN : NewPHIs)
+ NewPHISet.insert(&PN);
+ for (PHINode &PHI : NewPHIs) {
+ for (Value *V : PHI.incoming_values()) {
+ PHINode *IncPHI = dyn_cast<PHINode>(V);
+ if (IncPHI && IncPHI != &PHI && NewPHISet.contains(IncPHI)) // Self-reference is allowed.
+ return true;
+ }
+ }
+ return false;
+}
+#endif
+
+static bool replaceIfIdentical(PHINode &PHI, PHINode &ReplPHI) { // Returns true iff PHI was erased.
+ if (!PHI.isIdenticalToWhenDefined(&ReplPHI))
+ return false; // Not identical: PHI is left in place.
+ PHI.replaceAllUsesWith(&ReplPHI); // Redirect users to ReplPHI before erasing.
+ PHI.eraseFromParent(); // Callers must not touch PHI after this point.
+ return true;
+}
+
+bool EliminateNewDuplicatePHINodes(BasicBlock *BB,
+ BasicBlock::phi_iterator FirstExistingPN) { // New PHIs: [BB->phis().begin(), FirstExistingPN).
+ assert(!PHIAreRefEachOther(make_range(BB->phis().begin(), FirstExistingPN)));
+
+ // Deduplicate new PHIs first to reduce the number of comparisons on the
+ // following new -> existing pass. Changed records whether any PHI was erased.
+ bool Changed = false;
+ for (auto I = BB->phis().begin(); I != FirstExistingPN; ++I) {
+ for (auto J = std::next(I); J != FirstExistingPN;) {
+ Changed |= replaceIfIdentical(*J++, *I); // J advances before its old PHI may be erased.
+ }
+ }
+
+ // Iterate over existing PHIs and replace new PHIs identical to each of them.
+ for (PHINode &ExistingPHI : make_range(FirstExistingPN, BB->phis().end())) {
+ auto I = BB->phis().begin(); // Re-read each round: the previous first PHI may be gone.
+ assert(I != FirstExistingPN); // Should be at least one new PHI.
+ do {
+ Changed |= replaceIfIdentical(*I++, ExistingPHI); // I advances before erase.
+ } while (I != FirstExistingPN);
+ if (BB->phis().begin() == FirstExistingPN)
+ return Changed; // No new PHIs remain, nothing left to compare.
+ }
+ return Changed;
+}
+
+static void deduplicatePass(ArrayRef<PHINode *> Worklist) {
+ SmallDenseMap<BasicBlock *, unsigned> BBs; // Block -> count of non-null worklist PHIs in it.
+ for (PHINode *PHI : Worklist) {
+ if (PHI) // nullptr marks an entry already removed (see simplifyPass()).
+ ++BBs[PHI->getParent()];
+ }
+
+ for (auto [BB, NumNewPHIs] : BBs) {
+ auto FirstExistingPN = std::next(BB->phis().begin(), NumNewPHIs); // Assumes the counted PHIs are the first NumNewPHIs PHIs of BB.
+ EliminateNewDuplicatePHINodes(BB, FirstExistingPN);
+ }
+}
+
+void SSAUpdaterBulk::RewriteAndOptimizeAllUses(DominatorTree &DT) { // RewriteAllUses(), then simplify and deduplicate the PHIs it reports.
+ SmallVector<PHINode *, 4> PHIs; // Passed as RewriteAllUses()'s InsertedPHIs out-parameter.
+ RewriteAllUses(&DT, &PHIs);
+ if (PHIs.empty())
+ return; // No PHIs reported: PHIs.front() below would be invalid.
+
+ simplifyPass(PHIs, PHIs.front()->getParent()->getDataLayout()); // DataLayout is fetched before any PHI is erased.
+ deduplicatePass(PHIs);
+}
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 45cee1e..9035e58 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -526,7 +526,7 @@ Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
// Recognize the canonical representation of an unsimplifed urem.
const SCEV *URemLHS = nullptr;
const SCEV *URemRHS = nullptr;
- if (SE.matchURem(S, URemLHS, URemRHS)) {
+ if (match(S, m_scev_URem(m_SCEV(URemLHS), m_SCEV(URemRHS), SE))) {
Value *LHS = expand(URemLHS);
Value *RHS = expand(URemRHS);
return InsertBinop(Instruction::URem, LHS, RHS, SCEV::FlagAnyWrap,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a6f4bec..88af2cf 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10659,7 +10659,8 @@ class InstructionsCompatibilityAnalysis {
static bool isSupportedOpcode(const unsigned Opcode) {
return Opcode == Instruction::Add || Opcode == Instruction::LShr ||
Opcode == Instruction::Shl || Opcode == Instruction::SDiv ||
- Opcode == Instruction::UDiv;
+ Opcode == Instruction::UDiv || Opcode == Instruction::And ||
+ Opcode == Instruction::Or || Opcode == Instruction::Xor;
}
/// Identifies the best candidate value, which represents main opcode
@@ -10984,6 +10985,9 @@ public:
case Instruction::Shl:
case Instruction::SDiv:
case Instruction::UDiv:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
break;
default:
@@ -19456,7 +19460,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
assert(getNumElements(Cond->getType()) == TrueNumElements &&
"Cannot vectorize Instruction::Select");
- Value *V = Builder.CreateSelect(Cond, True, False);
+ Value *V =
+ Builder.CreateSelectWithUnknownProfile(Cond, True, False, DEBUG_TYPE);
V = FinalShuffle(V, E);
E->VectorizedValue = V;
@@ -23576,18 +23581,19 @@ class HorizontalReduction {
switch (Kind) {
case RecurKind::Or: {
if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
- return Builder.CreateSelect(
+ return Builder.CreateSelectWithUnknownProfile(
LHS, ConstantInt::getAllOnesValue(CmpInst::makeCmpResultType(OpTy)),
- RHS, Name);
+ RHS, DEBUG_TYPE, Name);
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
}
case RecurKind::And: {
if (UseSelect && OpTy == CmpInst::makeCmpResultType(OpTy))
- return Builder.CreateSelect(
+ return Builder.CreateSelectWithUnknownProfile(
LHS, RHS,
- ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)), Name);
+ ConstantInt::getNullValue(CmpInst::makeCmpResultType(OpTy)),
+ DEBUG_TYPE, Name);
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
Name);
@@ -23608,7 +23614,8 @@ class HorizontalReduction {
if (UseSelect) {
CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ return Builder.CreateSelectWithUnknownProfile(Cmp, LHS, RHS, DEBUG_TYPE,
+ Name);
}
[[fallthrough]];
case RecurKind::FMax:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1fea068..0101942 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -635,9 +635,9 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
const VPRecipeBase *R = &VPBB->back();
bool IsSwitch = isa<VPInstruction>(R) &&
cast<VPInstruction>(R)->getOpcode() == Instruction::Switch;
- bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) ||
- match(R, m_BranchOnCond(m_VPValue())) ||
- match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
+ bool IsCondBranch =
+ isa<VPBranchOnMaskRecipe>(R) ||
+ match(R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount()));
(void)IsCondBranch;
(void)IsSwitch;
if (VPBB->getNumSuccessors() == 2 ||
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fb696be..8ca3bed 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1064,6 +1064,7 @@ public:
ResumeForEpilogue,
/// Returns the value for vscale.
VScale,
+ OpsEnd = VScale,
};
/// Returns true if this VPInstruction generates scalar values for all lanes.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 81deba2..c0147ce 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -433,8 +433,7 @@ static void addCanonicalIVRecipes(VPlan &Plan, VPBasicBlock *HeaderVPBB,
// We are about to replace the branch to exit the region. Remove the original
// BranchOnCond, if there is any.
DebugLoc LatchDL = DL;
- if (!LatchVPBB->empty() &&
- match(&LatchVPBB->back(), m_BranchOnCond(m_VPValue()))) {
+ if (!LatchVPBB->empty() && match(&LatchVPBB->back(), m_BranchOnCond())) {
LatchDL = LatchVPBB->getTerminator()->getDebugLoc();
LatchVPBB->getTerminator()->eraseFromParent();
}
@@ -480,8 +479,7 @@ static void createExtractsForLiveOuts(VPlan &Plan, VPBasicBlock *MiddleVPBB) {
static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL,
PredicatedScalarEvolution &PSE, Loop *TheLoop) {
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
auto *HeaderVPBB = cast<VPBasicBlock>(Plan.getEntry()->getSingleSuccessor());
canonicalHeaderAndLatch(HeaderVPBB, VPDT);
@@ -623,8 +621,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan,
}
void VPlanTransforms::createLoopRegions(VPlan &Plan) {
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
for (VPBlockBase *HeaderVPB : vp_post_order_shallow(Plan.getEntry()))
if (canonicalHeaderAndLatch(HeaderVPB, VPDT))
createLoopRegion(Plan, HeaderVPB);
@@ -875,8 +872,7 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
auto *VPBB = cast<VPBasicBlock>(VPB);
for (auto &R : *VPBB) {
- if (R.mayWriteToMemory() &&
- !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+ if (R.mayWriteToMemory() && !match(&R, m_BranchOnCount()))
return false;
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 577432f..44506f5a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -39,7 +39,6 @@ class VPDominatorTree : public DominatorTreeBase<VPBlockBase, false> {
using Base = DominatorTreeBase<VPBlockBase, false>;
public:
- VPDominatorTree() = default;
explicit VPDominatorTree(VPlan &Plan) { recalculate(Plan); }
/// Returns true if \p A properly dominates \p B.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index 555efea..b42b049 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -344,6 +344,10 @@ m_Freeze(const Op0_t &Op0) {
return m_VPInstruction<Instruction::Freeze>(Op0);
}
+inline VPInstruction_match<VPInstruction::BranchOnCond> m_BranchOnCond() { // Operand-less overload; this patch uses it in place of m_BranchOnCond(m_VPValue()).
+ return m_VPInstruction<VPInstruction::BranchOnCond>(); // NOTE(review): presumably matches on opcode alone - confirm in VPInstruction_match.
+}
+
template <typename Op0_t>
inline VPInstruction_match<VPInstruction::BranchOnCond, Op0_t>
m_BranchOnCond(const Op0_t &Op0) {
@@ -374,6 +378,10 @@ m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
}
+inline VPInstruction_match<VPInstruction::BranchOnCount> m_BranchOnCount() { // Operand-less overload; this patch uses it in place of m_BranchOnCount(m_VPValue(), m_VPValue()).
+ return m_VPInstruction<VPInstruction::BranchOnCount>(); // NOTE(review): presumably matches on opcode alone - confirm in VPInstruction_match.
+}
+
template <typename Op0_t, typename Op1_t>
inline VPInstruction_match<VPInstruction::BranchOnCount, Op0_t, Op1_t>
m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 9bb8820..40b7e8d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1658,7 +1658,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF,
auto *Term = &ExitingVPBB->back();
VPValue *Cond;
ScalarEvolution &SE = *PSE.getSE();
- if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+ if (match(Term, m_BranchOnCount()) ||
match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
m_VPValue(), m_VPValue(), m_VPValue()))))) {
// Try to simplify the branch condition if TC <= VF * UF when the latch
@@ -1909,8 +1909,7 @@ static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR,
bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
VPBuilder &LoopBuilder) {
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
SmallVector<VPFirstOrderRecurrencePHIRecipe *> RecurrencePhis;
for (VPRecipeBase &R :
@@ -1992,6 +1991,13 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
.Case<VPWidenIntrinsicRecipe>([](auto *I) {
return std::make_pair(true, I->getVectorIntrinsicID());
})
+ .Case<VPVectorPointerRecipe>([](auto *I) {
+ // For recipes that do not directly map to LLVM IR instructions,
+ // assign opcodes after the last VPInstruction opcode (which is also
+ // after the last IR Instruction opcode), based on the VPDefID.
+ return std::make_pair(false,
+ VPInstruction::OpsEnd + 1 + I->getVPDefID());
+ })
.Default([](auto *) { return std::nullopt; });
}
@@ -2015,11 +2021,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
static bool canHandle(const VPSingleDefRecipe *Def) {
// We can extend the list of handled recipes in the future,
// provided we account for the data embedded in them while checking for
- // equality or hashing. We assign VPVectorEndPointerRecipe the GEP opcode,
- // as it is essentially a GEP with different semantics.
- auto C = isa<VPVectorPointerRecipe>(Def)
- ? std::make_pair(false, Instruction::GetElementPtr)
- : getOpcodeOrIntrinsicID(Def);
+ // equality or hashing.
+ auto C = getOpcodeOrIntrinsicID(Def);
// The issue with (Insert|Extract)Value is that the index of the
// insert/extract is not a proper operand in LLVM IR, and hence also not in
@@ -2058,6 +2061,8 @@ struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
vputils::isSingleScalar(L) != vputils::isSingleScalar(R) ||
!equal(L->operands(), R->operands()))
return false;
+ assert(getOpcodeOrIntrinsicID(L) && getOpcodeOrIntrinsicID(R) &&
+ "must have valid opcode info for both recipes");
if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
if (LFlags->hasPredicate() &&
LFlags->getPredicate() !=
@@ -3021,8 +3026,7 @@ void VPlanTransforms::createInterleaveGroups(
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
for (const auto *IG : InterleaveGroups) {
auto *Start =
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
@@ -3398,9 +3402,8 @@ void VPlanTransforms::handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB,
VPBuilder Builder(LatchVPBB->getTerminator());
VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
- assert(
- match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond(m_VPValue())) &&
- "Terminator must be be BranchOnCond");
+ assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
+ "Terminator must be be BranchOnCond");
VPValue *CondOfEarlyExitingVPBB =
EarlyExitingVPBB->getTerminator()->getOperand(0);
auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
@@ -3662,8 +3665,7 @@ void VPlanTransforms::materializeBroadcasts(VPlan &Plan) {
return;
#ifndef NDEBUG
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
#endif
SmallVector<VPValue *> VPValues;
@@ -4009,8 +4011,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
unsigned VFMinVal = VF.getKnownMinValue();
SmallVector<VPInterleaveRecipe *> StoreGroups;
for (auto &R : *VectorLoop->getEntryBasicBlock()) {
- if (isa<VPCanonicalIVPHIRecipe>(&R) ||
- match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+ if (isa<VPCanonicalIVPHIRecipe>(&R) || match(&R, m_BranchOnCount()))
continue;
if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe>(&R) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index 5e7f19f..1c4adfc 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -259,8 +259,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
/// Handle non-header-phi recipes.
void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
- if (match(&R, m_BranchOnCond(m_VPValue())) ||
- match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+ if (match(&R, m_CombineOr(m_BranchOnCond(), m_BranchOnCount())))
return;
if (auto *VPI = dyn_cast<VPInstruction>(&R)) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 013ea2e..5262af6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -24,6 +24,7 @@
#define DEBUG_TYPE "loop-vectorize"
using namespace llvm;
+using namespace VPlanPatternMatch;
namespace {
class VPlanVerifier {
@@ -198,7 +199,6 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
}
// EVLIVIncrement is only used by EVLIV & BranchOnCount.
// Having more than two users is unexpected.
- using namespace llvm::VPlanPatternMatch;
if (I->getOpcode() != VPInstruction::Broadcast &&
I->getNumUsers() != 1 &&
(I->getNumUsers() != 2 ||
@@ -479,8 +479,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
}
auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exiting->end()));
- if (!LastInst || (LastInst->getOpcode() != VPInstruction::BranchOnCount &&
- LastInst->getOpcode() != VPInstruction::BranchOnCond)) {
+ if (!match(LastInst, m_CombineOr(m_BranchOnCond(), m_BranchOnCount()))) {
errs() << "VPlan vector loop exit must end with BranchOnCount or "
"BranchOnCond VPInstruction\n";
return false;
@@ -490,8 +489,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) {
}
bool llvm::verifyVPlanIsValid(const VPlan &Plan, bool VerifyLate) {
- VPDominatorTree VPDT;
- VPDT.recalculate(const_cast<VPlan &>(Plan));
+ VPDominatorTree VPDT(const_cast<VPlan &>(Plan));
VPTypeAnalysis TypeInfo(Plan);
VPlanVerifier Verifier(VPDT, TypeInfo, VerifyLate);
return Verifier.verify(Plan);
diff --git a/llvm/lib/XRay/BlockIndexer.cpp b/llvm/lib/XRay/BlockIndexer.cpp
index f4ba0eb..d0c6853 100644
--- a/llvm/lib/XRay/BlockIndexer.cpp
+++ b/llvm/lib/XRay/BlockIndexer.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/BlockIndexer.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error BlockIndexer::visit(BufferExtents &) { return Error::success(); }
@@ -89,6 +89,3 @@ Error BlockIndexer::flush() {
CurrentBlock.WallclockTime = nullptr;
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/BlockPrinter.cpp b/llvm/lib/XRay/BlockPrinter.cpp
index 63a60c3..d85be5b 100644
--- a/llvm/lib/XRay/BlockPrinter.cpp
+++ b/llvm/lib/XRay/BlockPrinter.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/BlockPrinter.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error BlockPrinter::visit(BufferExtents &R) {
OS << "\n[New Block]\n";
@@ -108,6 +108,3 @@ Error BlockPrinter::visit(EndBufferRecord &R) {
auto E = RP.visit(R);
return E;
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/BlockVerifier.cpp b/llvm/lib/XRay/BlockVerifier.cpp
index 99f255e..e39f6b6 100644
--- a/llvm/lib/XRay/BlockVerifier.cpp
+++ b/llvm/lib/XRay/BlockVerifier.cpp
@@ -10,19 +10,18 @@
#include <bitset>
-namespace llvm {
-namespace xray {
-namespace {
+using namespace llvm;
+using namespace llvm::xray;
-constexpr unsigned long long mask(BlockVerifier::State S) {
+static constexpr unsigned long long mask(BlockVerifier::State S) {
return 1uLL << static_cast<std::size_t>(S);
}
-constexpr std::size_t number(BlockVerifier::State S) {
+static constexpr std::size_t number(BlockVerifier::State S) {
return static_cast<std::size_t>(S);
}
-StringRef recordToString(BlockVerifier::State R) {
+static StringRef recordToString(BlockVerifier::State R) {
switch (R) {
case BlockVerifier::State::BufferExtents:
return "BufferExtents";
@@ -53,6 +52,8 @@ StringRef recordToString(BlockVerifier::State R) {
llvm_unreachable("Unkown state!");
}
+namespace {
+
struct Transition {
BlockVerifier::State From;
std::bitset<number(BlockVerifier::State::StateMax)> ToStates;
@@ -133,7 +134,7 @@ Error BlockVerifier::transition(State To) {
CurrentRecord = To;
return Error::success();
-} // namespace xray
+}
Error BlockVerifier::visit(BufferExtents &) {
return transition(State::BufferExtents);
@@ -201,6 +202,3 @@ Error BlockVerifier::verify() {
}
void BlockVerifier::reset() { CurrentRecord = State::Unknown; }
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRRecordProducer.cpp b/llvm/lib/XRay/FDRRecordProducer.cpp
index 479b710..0f4eed1 100644
--- a/llvm/lib/XRay/FDRRecordProducer.cpp
+++ b/llvm/lib/XRay/FDRRecordProducer.cpp
@@ -10,8 +10,8 @@
#include <cstdint>
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
namespace {
@@ -31,8 +31,9 @@ enum MetadataRecordKinds : uint8_t {
// This is an end marker, used to identify the upper bound for this enum.
EnumEndMarker,
};
+} // namespace
-Expected<std::unique_ptr<Record>>
+static Expected<std::unique_ptr<Record>>
metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
if (T >= static_cast<uint8_t>(MetadataRecordKinds::EnumEndMarker))
@@ -72,12 +73,10 @@ metadataRecordType(const XRayFileHeader &Header, uint8_t T) {
llvm_unreachable("Unhandled MetadataRecordKinds enum value");
}
-constexpr bool isMetadataIntroducer(uint8_t FirstByte) {
+static constexpr bool isMetadataIntroducer(uint8_t FirstByte) {
return FirstByte & 0x01u;
}
-} // namespace
-
Expected<std::unique_ptr<Record>>
FileBasedRecordProducer::findNextBufferExtent() {
// We seek one byte at a time until we find a suitable buffer extents metadata
@@ -193,6 +192,3 @@ Expected<std::unique_ptr<Record>> FileBasedRecordProducer::produce() {
assert(R != nullptr);
return std::move(R);
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRRecords.cpp b/llvm/lib/XRay/FDRRecords.cpp
index ff315d3..a18f733 100644
--- a/llvm/lib/XRay/FDRRecords.cpp
+++ b/llvm/lib/XRay/FDRRecords.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error BufferExtents::apply(RecordVisitor &V) { return V.visit(*this); }
Error WallclockRecord::apply(RecordVisitor &V) { return V.visit(*this); }
@@ -61,6 +61,3 @@ StringRef Record::kindToString(RecordKind K) {
}
return "Unknown";
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRTraceExpander.cpp b/llvm/lib/XRay/FDRTraceExpander.cpp
index b68e997..991e6e5 100644
--- a/llvm/lib/XRay/FDRTraceExpander.cpp
+++ b/llvm/lib/XRay/FDRTraceExpander.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRTraceExpander.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
void TraceExpander::resetCurrentRecord() {
if (BuildingRecord)
@@ -126,6 +126,3 @@ Error TraceExpander::flush() {
resetCurrentRecord();
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FDRTraceWriter.cpp b/llvm/lib/XRay/FDRTraceWriter.cpp
index fb59125..3e320a6 100644
--- a/llvm/lib/XRay/FDRTraceWriter.cpp
+++ b/llvm/lib/XRay/FDRTraceWriter.cpp
@@ -12,8 +12,8 @@
#include "llvm/XRay/FDRTraceWriter.h"
#include <tuple>
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
namespace {
@@ -37,9 +37,10 @@ template <size_t Index> struct IndexedWriter {
return 0;
}
};
+} // namespace
template <uint8_t Kind, class... Values>
-Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) {
+static Error writeMetadata(support::endian::Writer &OS, Values &&...Ds) {
// The first bit in the first byte of metadata records is always set to 1, so
// we ensure this is the case when we write out the first byte of the record.
uint8_t FirstByte = (static_cast<uint8_t>(Kind) << 1) | uint8_t{0x01u};
@@ -54,8 +55,6 @@ Error writeMetadata(support::endian::Writer &OS, Values &&... Ds) {
return Error::success();
}
-} // namespace
-
FDRTraceWriter::FDRTraceWriter(raw_ostream &O, const XRayFileHeader &H)
: OS(O, llvm::endianness::native) {
// We need to re-construct a header, by writing the fields we care about for
@@ -146,6 +145,3 @@ Error FDRTraceWriter::visit(FunctionRecord &R) {
OS.write(R.delta());
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/FileHeaderReader.cpp b/llvm/lib/XRay/FileHeaderReader.cpp
index 6b6daf9..681cef7 100644
--- a/llvm/lib/XRay/FileHeaderReader.cpp
+++ b/llvm/lib/XRay/FileHeaderReader.cpp
@@ -7,12 +7,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FileHeaderReader.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
// Populates the FileHeader reference by reading the first 32 bytes of the file.
-Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
- uint64_t &OffsetPtr) {
+Expected<XRayFileHeader>
+xray::readBinaryFormatHeader(DataExtractor &HeaderExtractor,
+ uint64_t &OffsetPtr) {
// FIXME: Maybe deduce whether the data is little or big-endian using some
// magic bytes in the beginning of the file?
@@ -68,6 +69,3 @@ Expected<XRayFileHeader> readBinaryFormatHeader(DataExtractor &HeaderExtractor,
OffsetPtr += 16;
return std::move(FileHeader);
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/LogBuilderConsumer.cpp b/llvm/lib/XRay/LogBuilderConsumer.cpp
index ffb49f9..f0fc336 100644
--- a/llvm/lib/XRay/LogBuilderConsumer.cpp
+++ b/llvm/lib/XRay/LogBuilderConsumer.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRRecordConsumer.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error LogBuilderConsumer::consume(std::unique_ptr<Record> R) {
if (!R)
@@ -32,6 +32,3 @@ Error PipelineConsumer::consume(std::unique_ptr<Record> R) {
Result = joinErrors(std::move(Result), R->apply(*V));
return Result;
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/Profile.cpp b/llvm/lib/XRay/Profile.cpp
index 1b340e5..ecb767b 100644
--- a/llvm/lib/XRay/Profile.cpp
+++ b/llvm/lib/XRay/Profile.cpp
@@ -18,8 +18,8 @@
#include "llvm/XRay/Trace.h"
#include <memory>
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Profile::Profile(const Profile &O) {
// We need to re-create all the tries from the original (O), into the current
@@ -46,6 +46,7 @@ struct BlockHeader {
uint32_t Number;
uint64_t Thread;
};
+} // namespace
static Expected<BlockHeader> readBlockHeader(DataExtractor &Extractor,
uint64_t &Offset) {
@@ -115,8 +116,6 @@ static Expected<Profile::Data> readData(DataExtractor &Extractor,
return D;
}
-} // namespace
-
Error Profile::addBlock(Block &&B) {
if (B.PathData.empty())
return make_error<StringError>(
@@ -189,7 +188,7 @@ Profile::PathID Profile::internPath(ArrayRef<FuncID> P) {
return Node->ID;
}
-Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
+Profile xray::mergeProfilesByThread(const Profile &L, const Profile &R) {
Profile Merged;
using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
using PathDataMapPtr = std::unique_ptr<PathDataMap>;
@@ -228,7 +227,7 @@ Profile mergeProfilesByThread(const Profile &L, const Profile &R) {
return Merged;
}
-Profile mergeProfilesByStack(const Profile &L, const Profile &R) {
+Profile xray::mergeProfilesByStack(const Profile &L, const Profile &R) {
Profile Merged;
using PathDataMap = DenseMap<Profile::PathID, Profile::Data>;
PathDataMap PathData;
@@ -258,7 +257,7 @@ Profile mergeProfilesByStack(const Profile &L, const Profile &R) {
return Merged;
}
-Expected<Profile> loadProfile(StringRef Filename) {
+Expected<Profile> xray::loadProfile(StringRef Filename) {
Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);
if (!FdOrErr)
return FdOrErr.takeError();
@@ -322,7 +321,7 @@ struct StackEntry {
} // namespace
-Expected<Profile> profileFromTrace(const Trace &T) {
+Expected<Profile> xray::profileFromTrace(const Trace &T) {
Profile P;
// The implementation of the algorithm re-creates the execution of
@@ -397,6 +396,3 @@ Expected<Profile> profileFromTrace(const Trace &T) {
return P;
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/RecordInitializer.cpp b/llvm/lib/XRay/RecordInitializer.cpp
index 68ab3db..83d5f14 100644
--- a/llvm/lib/XRay/RecordInitializer.cpp
+++ b/llvm/lib/XRay/RecordInitializer.cpp
@@ -7,8 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/XRay/FDRRecords.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error RecordInitializer::visit(BufferExtents &R) {
if (!E.isValidOffsetForDataOfSize(OffsetPtr, sizeof(uint64_t)))
@@ -426,6 +426,3 @@ Error RecordInitializer::visit(FunctionRecord &R) {
assert(FunctionRecord::kFunctionRecordSize == (OffsetPtr - BeginOffset));
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/RecordPrinter.cpp b/llvm/lib/XRay/RecordPrinter.cpp
index 32d4210..b9b7a16 100644
--- a/llvm/lib/XRay/RecordPrinter.cpp
+++ b/llvm/lib/XRay/RecordPrinter.cpp
@@ -9,8 +9,8 @@
#include "llvm/Support/FormatVariadic.h"
-namespace llvm {
-namespace xray {
+using namespace llvm;
+using namespace llvm::xray;
Error RecordPrinter::visit(BufferExtents &R) {
OS << formatv("<Buffer: size = {0} bytes>", R.size()) << Delim;
@@ -103,6 +103,3 @@ Error RecordPrinter::visit(FunctionRecord &R) {
OS << Delim;
return Error::success();
}
-
-} // namespace xray
-} // namespace llvm
diff --git a/llvm/lib/XRay/Trace.cpp b/llvm/lib/XRay/Trace.cpp
index 74515b1..14a3f01 100644
--- a/llvm/lib/XRay/Trace.cpp
+++ b/llvm/lib/XRay/Trace.cpp
@@ -29,11 +29,9 @@ using namespace llvm;
using namespace llvm::xray;
using llvm::yaml::Input;
-namespace {
-
-Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
- XRayFileHeader &FileHeader,
- std::vector<XRayRecord> &Records) {
+static Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
+ XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
if (Data.size() < 32)
return make_error<StringError>(
"Not enough bytes for an XRay log.",
@@ -265,8 +263,9 @@ Error loadNaiveFormatLog(StringRef Data, bool IsLittleEndian,
/// what FunctionRecord instances use, and we no longer need to include the CPU
/// id in the CustomEventRecord.
///
-Error loadFDRLog(StringRef Data, bool IsLittleEndian,
- XRayFileHeader &FileHeader, std::vector<XRayRecord> &Records) {
+static Error loadFDRLog(StringRef Data, bool IsLittleEndian,
+ XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
if (Data.size() < 32)
return createStringError(std::make_error_code(std::errc::invalid_argument),
@@ -348,8 +347,8 @@ Error loadFDRLog(StringRef Data, bool IsLittleEndian,
return Error::success();
}
-Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
- std::vector<XRayRecord> &Records) {
+static Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
+ std::vector<XRayRecord> &Records) {
YAMLXRayTrace Trace;
Input In(Data);
In >> Trace;
@@ -376,7 +375,6 @@ Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
});
return Error::success();
}
-} // namespace
Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);
diff --git a/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll b/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
index e784d25..acac2c9 100644
--- a/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
+++ b/llvm/test/Analysis/ScalarEvolution/ptrtoint.ll
@@ -447,6 +447,84 @@ bb5:
ret void
}
+define void @pr46786_c26_char_cmp_ops_swapped(ptr %arg, ptr %arg1, ptr %arg2) {
+; X64-LABEL: 'pr46786_c26_char_cmp_ops_swapped'
+; X64-NEXT: Classifying expressions for: @pr46786_c26_char_cmp_ops_swapped
+; X64-NEXT: %i4 = ptrtoint ptr %arg to i64
+; X64-NEXT: --> (ptrtoint ptr %arg to i64) U: full-set S: full-set
+; X64-NEXT: %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ]
+; X64-NEXT: --> {%arg,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i8 = load i8, ptr %i7, align 1
+; X64-NEXT: --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT: %i9 = ptrtoint ptr %i7 to i64
+; X64-NEXT: --> {(ptrtoint ptr %arg to i64),+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i10 = sub i64 %i9, %i4
+; X64-NEXT: --> {0,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64)) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
+; X64-NEXT: --> {%arg2,+,1}<nw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg2) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: %i12 = load i8, ptr %i11, align 1
+; X64-NEXT: --> %i12 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT: %i13 = add i8 %i12, %i8
+; X64-NEXT: --> (%i12 + %i8) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X64-NEXT: %i14 = getelementptr inbounds i8, ptr %i7, i64 1
+; X64-NEXT: --> {(1 + %arg),+,1}<nuw><%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64) + %arg) LoopDispositions: { %bb6: Computable }
+; X64-NEXT: Determining loop execution counts for: @pr46786_c26_char_cmp_ops_swapped
+; X64-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64))
+; X64-NEXT: Loop %bb6: constant max backedge-taken count is i64 -1
+; X64-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i64)) + (ptrtoint ptr %arg1 to i64))
+; X64-NEXT: Loop %bb6: Trip multiple is 1
+;
+; X32-LABEL: 'pr46786_c26_char_cmp_ops_swapped'
+; X32-NEXT: Classifying expressions for: @pr46786_c26_char_cmp_ops_swapped
+; X32-NEXT: %i4 = ptrtoint ptr %arg to i64
+; X32-NEXT: --> (zext i32 (ptrtoint ptr %arg to i32) to i64) U: [0,4294967296) S: [0,4294967296)
+; X32-NEXT: %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ]
+; X32-NEXT: --> {%arg,+,1}<nuw><%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i8 = load i8, ptr %i7, align 1
+; X32-NEXT: --> %i8 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT: %i9 = ptrtoint ptr %i7 to i64
+; X32-NEXT: --> {(zext i32 (ptrtoint ptr %arg to i32) to i64),+,1}<nuw><%bb6> U: [0,8589934591) S: [0,8589934591) Exits: ((zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) + (zext i32 (ptrtoint ptr %arg to i32) to i64)) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i10 = sub i64 %i9, %i4
+; X32-NEXT: --> {0,+,1}<nuw><%bb6> U: [0,4294967296) S: [0,4294967296) Exits: (zext i32 (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32)) to i64) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
+; X32-NEXT: --> {%arg2,+,1}<%bb6> U: full-set S: full-set Exits: (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg2) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: %i12 = load i8, ptr %i11, align 1
+; X32-NEXT: --> %i12 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT: %i13 = add i8 %i12, %i8
+; X32-NEXT: --> (%i12 + %i8) U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %bb6: Variant }
+; X32-NEXT: %i14 = getelementptr inbounds i8, ptr %i7, i64 1
+; X32-NEXT: --> {(1 + %arg),+,1}<nuw><%bb6> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32) + %arg) LoopDispositions: { %bb6: Computable }
+; X32-NEXT: Determining loop execution counts for: @pr46786_c26_char_cmp_ops_swapped
+; X32-NEXT: Loop %bb6: backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32))
+; X32-NEXT: Loop %bb6: constant max backedge-taken count is i32 -1
+; X32-NEXT: Loop %bb6: symbolic max backedge-taken count is (-1 + (-1 * (ptrtoint ptr %arg to i32)) + (ptrtoint ptr %arg1 to i32))
+; X32-NEXT: Loop %bb6: Trip multiple is 1
+;
+ %i = icmp eq ptr %arg1, %arg
+ br i1 %i, label %bb5, label %bb3
+
+bb3:
+ %i4 = ptrtoint ptr %arg to i64
+ br label %bb6
+
+bb6:
+ %i7 = phi ptr [ %arg, %bb3 ], [ %i14, %bb6 ]
+ %i8 = load i8, ptr %i7
+ %i9 = ptrtoint ptr %i7 to i64
+ %i10 = sub i64 %i9, %i4
+ %i11 = getelementptr inbounds i8, ptr %arg2, i64 %i10
+ %i12 = load i8, ptr %i11
+ %i13 = add i8 %i12, %i8
+ store i8 %i13, ptr %i11
+ %i14 = getelementptr inbounds i8, ptr %i7, i64 1
+ %i15 = icmp eq ptr %i14, %arg1
+ br i1 %i15, label %bb5, label %bb6
+
+bb5:
+ ret void
+}
+
+
; void pr46786_c26_int(int* start, int *end, int *other) {
; for (int* cur = start; cur != end; ++cur)
; other[cur - start] += *cur;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
index 8552931..ee35447 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-ashr.mir
@@ -102,8 +102,8 @@ body: |
; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
%0:_(<4 x s16>) = COPY $d0
- %2:_(s16) = COPY $h0
- %1:_(s16) = G_CONSTANT i16 3
+ %1:_(s16) = COPY $h0
+ %2:_(s16) = G_CONSTANT i16 3
%3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
%4:_(<4 x s16>) = G_ASHR %0, %3
...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
index 61d1c43..97bcb80 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-shl.mir
@@ -135,8 +135,8 @@ body: |
; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
%0:_(<4 x s16>) = COPY $d0
- %2:_(s16) = COPY $h0
- %1:_(s16) = G_CONSTANT i16 3
+ %1:_(s16) = COPY $h0
+ %2:_(s16) = G_CONSTANT i16 3
%3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
%4:_(<4 x s16>) = G_SHL %0, %3
...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir
new file mode 100644
index 0000000..332049d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/knownbits-sub.mir
@@ -0,0 +1,276 @@
+# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=aarch64 -passes="print<gisel-value-tracking>" -filetype=null %s 2>&1 | FileCheck %s
+
+---
+name: Cst
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @Cst
+ ; CHECK-NEXT: %0:_ KnownBits:00000010 SignBits:6
+ ; CHECK-NEXT: %1:_ KnownBits:11100000 SignBits:3
+ ; CHECK-NEXT: %2:_ KnownBits:00100010 SignBits:2
+ %0:_(s8) = G_CONSTANT i8 2
+ %1:_(s8) = G_CONSTANT i8 224
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstZero
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstZero
+ ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:00000000 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(s8) = G_CONSTANT i8 0
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstNegOne
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstNegOne
+ ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:00000001 SignBits:7
+ ; CHECK-NEXT: %2:_ KnownBits:11111111 SignBits:8
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(s8) = G_CONSTANT i8 1
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstNegFour
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstNegFour
+ ; CHECK-NEXT: %0:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %1:_ KnownBits:00000100 SignBits:5
+ ; CHECK-NEXT: %2:_ KnownBits:11111100 SignBits:6
+ %0:_(s8) = G_CONSTANT i8 0
+ %1:_(s8) = G_CONSTANT i8 4
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: CstNeg
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @CstNeg
+ ; CHECK-NEXT: %0:_ KnownBits:11100000 SignBits:3
+ ; CHECK-NEXT: %1:_ KnownBits:00000010 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:11011110 SignBits:2
+ %0:_(s8) = G_CONSTANT i8 224
+ %1:_(s8) = G_CONSTANT i8 2
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: ScalarVar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = COPY $b1
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: ScalarRhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarRhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 3
+ %2:_(s8) = G_SUB %0, %1
+...
+---
+name: ScalarNonNegative
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarNonNegative
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %3:_ KnownBits:00000000 SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:4
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 15
+ %2:_(s8) = G_AND %0, %1
+ %3:_(s8) = G_CONSTANT i8 0
+ %4:_(s8) = G_SUB %3, %2
+...
+---
+name: ScalarLhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarLhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00000011 SignBits:6
+ ; CHECK-NEXT: %2:_ KnownBits:???????? SignBits:1
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 3
+ %2:_(s8) = G_SUB %1, %0
+...
+---
+name: ScalarPartKnown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @ScalarPartKnown
+ ; CHECK-NEXT: %0:_ KnownBits:???????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:00001111 SignBits:4
+ ; CHECK-NEXT: %2:_ KnownBits:0000???? SignBits:4
+ ; CHECK-NEXT: %3:_ KnownBits:00000101 SignBits:5
+ ; CHECK-NEXT: %4:_ KnownBits:???????? SignBits:3
+ %0:_(s8) = COPY $b0
+ %1:_(s8) = G_CONSTANT i8 15
+ %2:_(s8) = G_AND %0, %1
+ %3:_(s8) = G_CONSTANT i8 5
+ %4:_(s8) = G_SUB %2, %3
+...
+---
+name: VectorCstZero
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCstZero
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000000 SignBits:16
+ %0:_(s16) = G_CONSTANT i16 0
+ %1:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+ %3:_(<4 x s16>) = G_SUB %1, %2
+...
+---
+name: VectorCstNegOne
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCstNegOne
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000001 SignBits:15
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000001 SignBits:15
+ ; CHECK-NEXT: %4:_ KnownBits:1111111111111111 SignBits:16
+ %0:_(s16) = G_CONSTANT i16 0
+ %1:_(s16) = G_CONSTANT i16 1
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %0, %0, %0
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %4:_(<4 x s16>) = G_SUB %2, %3
+...
+---
+name: VectorVar
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorVar
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(<4 x s16>) = COPY $d1
+ %2:_(<4 x s16>) = G_SUB %0, %1
+...
+---
+name: VectorRhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorRhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 3
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_SUB %2, %0
+...
+---
+name: VectorNonNegative
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorNonNegative
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %5:_ KnownBits:0000000000000000 SignBits:16
+ ; CHECK-NEXT: %6:_ KnownBits:???????????????? SignBits:8
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 255
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_AND %0, %2
+ %4:_(s16) = G_CONSTANT i16 0
+ %5:_(<4 x s16>) = G_BUILD_VECTOR %4, %4, %4, %4
+ %6:_(<4 x s16>) = G_SUB %5, %3
+...
+---
+name: VectorLhsEarlyOut
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorLhsEarlyOut
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 3
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_SUB %0, %2
+...
+---
+name: VectorPartKnown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorPartKnown
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %2:_ KnownBits:0000000011111111 SignBits:8
+ ; CHECK-NEXT: %3:_ KnownBits:00000000???????? SignBits:8
+ ; CHECK-NEXT: %4:_ KnownBits:0000000000101010 SignBits:10
+ ; CHECK-NEXT: %5:_ KnownBits:0000000001001010 SignBits:9
+ ; CHECK-NEXT: %6:_ KnownBits:000000000??01010 SignBits:9
+ ; CHECK-NEXT: %7:_ KnownBits:???????????????? SignBits:7
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = G_CONSTANT i16 255
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %1, %1, %1, %1
+ %3:_(<4 x s16>) = G_AND %0, %2
+ %4:_(s16) = G_CONSTANT i16 42
+ %5:_(s16) = G_CONSTANT i16 74
+ %6:_(<4 x s16>) = G_BUILD_VECTOR %4, %5, %5, %4
+ %7:_(<4 x s16>) = G_SUB %6, %3
+...
+---
+name: VectorCst36
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCst36
+ ; CHECK-NEXT: %0:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %1:_ KnownBits:0000000000000110 SignBits:13
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000?1? SignBits:13
+ ; CHECK-NEXT: %3:_ KnownBits:0000000000000?1? SignBits:13
+ ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:12
+ %0:_(s16) = G_CONSTANT i16 3
+ %1:_(s16) = G_CONSTANT i16 6
+ %2:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %0, %1, %1, %0
+ %4:_(<4 x s16>) = G_SUB %2, %3
+...
+
+---
+name: VectorCst3unknown
+body: |
+ bb.1:
+ ; CHECK-LABEL: name: @VectorCst3unknown
+ ; CHECK-NEXT: %0:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %1:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %2:_ KnownBits:0000000000000011 SignBits:14
+ ; CHECK-NEXT: %3:_ KnownBits:???????????????? SignBits:1
+ ; CHECK-NEXT: %4:_ KnownBits:???????????????? SignBits:1
+ %0:_(<4 x s16>) = COPY $d0
+ %1:_(s16) = COPY $h0
+ %2:_(s16) = G_CONSTANT i16 3
+ %3:_(<4 x s16>) = G_BUILD_VECTOR %1, %2, %2, %1
+ %4:_(<4 x s16>) = G_SUB %0, %3
+...
diff --git a/llvm/test/CodeGen/AArch64/adds_cmn.ll b/llvm/test/CodeGen/AArch64/adds_cmn.ll
index aa070b7..9b456a5 100644
--- a/llvm/test/CodeGen/AArch64/adds_cmn.ll
+++ b/llvm/test/CodeGen/AArch64/adds_cmn.ll
@@ -22,10 +22,8 @@ entry:
define { i32, i32 } @adds_cmn_c(i32 noundef %x, i32 noundef %y) {
; CHECK-LABEL: adds_cmn_c:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: cmn w0, w1
-; CHECK-NEXT: add w1, w1, w0
-; CHECK-NEXT: cset w8, lo
-; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: adds w1, w0, w1
+; CHECK-NEXT: cset w0, lo
; CHECK-NEXT: ret
entry:
%0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %x, i32 %y)
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
index 5933c5d..b8302e6 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-win.mir
@@ -380,10 +380,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
- ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1)
- ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1)
+ ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
; CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 0 :: (load (s16) from %stack.4)
; CHECK-NEXT: frame-destroy SEH_SavePReg 4, 0
; CHECK-NEXT: $p5 = frame-destroy LDR_PXI $sp, 1 :: (load (s16) from %stack.3)
@@ -430,10 +428,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
- ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.1)
- ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.1)
+ ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 0
; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3)
@@ -557,10 +553,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
; CHECK-NEXT: $x21, $lr = frame-destroy LDPXi $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
; CHECK-NEXT: frame-destroy SEH_SaveRegP 21, 30, 16
- ; CHECK-NEXT: $x19, $x20 = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.4), (load (s64) from %stack.5)
- ; CHECK-NEXT: frame-destroy SEH_SaveRegP 19, 20, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 32
+ ; CHECK-NEXT: early-clobber $sp, $x19, $x20 = frame-destroy LDPXpost $sp, 4 :: (load (s64) from %stack.4), (load (s64) from %stack.5)
+ ; CHECK-NEXT: frame-destroy SEH_SaveRegP_X 19, 20, -32
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.21)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 2
; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.20)
@@ -745,10 +739,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
; CHECK-NEXT: frame-destroy SEH_SetFP
- ; CHECK-NEXT: $fp, $lr = frame-destroy LDPXi $sp, 0 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
- ; CHECK-NEXT: frame-destroy SEH_SaveFPLR 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.2), (load (s64) from %stack.3)
+ ; CHECK-NEXT: frame-destroy SEH_SaveFPLR_X -16
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.19)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 2
; CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 3 :: (load (s128) from %stack.18)
@@ -869,10 +861,8 @@ body: |
; CHECK-NEXT: frame-destroy SEH_EpilogStart
; CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 7, implicit $vg
; CHECK-NEXT: frame-destroy SEH_AllocZ 7
- ; CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 0 :: (load (s64) from %stack.6)
- ; CHECK-NEXT: frame-destroy SEH_SaveReg 30, 0
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0
- ; CHECK-NEXT: frame-destroy SEH_StackAlloc 16
+ ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.6)
+ ; CHECK-NEXT: frame-destroy SEH_SaveReg_X 30, -16
; CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.8)
; CHECK-NEXT: frame-destroy SEH_SaveZReg 8, 1
; CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.7)
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
index ecd48d6..149b4c4 100644
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -290,8 +290,7 @@ define i32 @unsigned_sat_variable_i32_using_cmp_sum(i32 %x, i32 %y) {
define i32 @unsigned_sat_variable_i32_using_cmp_notval(i32 %x, i32 %y) {
; CHECK-LABEL: unsigned_sat_variable_i32_using_cmp_notval:
; CHECK: // %bb.0:
-; CHECK-NEXT: add w8, w0, w1
-; CHECK-NEXT: cmn w1, w0
+; CHECK-NEXT: adds w8, w1, w0
; CHECK-NEXT: csinv w0, w8, wzr, lo
; CHECK-NEXT: ret
%noty = xor i32 %y, -1
@@ -331,8 +330,7 @@ define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 %y) {
define i64 @unsigned_sat_variable_i64_using_cmp_notval(i64 %x, i64 %y) {
; CHECK-LABEL: unsigned_sat_variable_i64_using_cmp_notval:
; CHECK: // %bb.0:
-; CHECK-NEXT: add x8, x0, x1
-; CHECK-NEXT: cmn x1, x0
+; CHECK-NEXT: adds x8, x1, x0
; CHECK-NEXT: csinv x0, x8, xzr, lo
; CHECK-NEXT: ret
%noty = xor i64 %y, -1
diff --git a/llvm/test/CodeGen/AArch64/win-sve.ll b/llvm/test/CodeGen/AArch64/win-sve.ll
index 53ac934..3ba4a1c 100644
--- a/llvm/test/CodeGen/AArch64/win-sve.ll
+++ b/llvm/test/CodeGen/AArch64/win-sve.ll
@@ -75,10 +75,8 @@ define i32 @f(<vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -234,10 +232,8 @@ define void @f2(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -384,10 +380,8 @@ define void @f3(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -538,10 +532,8 @@ define void @f4(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -702,10 +694,8 @@ define void @f5(i64 %n, <vscale x 2 x i64> %x) {
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -860,10 +850,10 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: stur x0, [x8, #16]
; CHECK-NEXT: addvl x8, x29, #18
; CHECK-NEXT: ldr x1, [x8, #32]
-; CHECK-NEXT: .Ltmp0:
+; CHECK-NEXT: .Ltmp0: // EH_LABEL
; CHECK-NEXT: add x0, x19, #0
; CHECK-NEXT: bl g6
-; CHECK-NEXT: .Ltmp1:
+; CHECK-NEXT: .Ltmp1: // EH_LABEL
; CHECK-NEXT: // %bb.1: // %invoke.cont
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: add sp, sp, #64
@@ -872,10 +862,8 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -932,8 +920,6 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: .seh_save_preg p14, 10
; CHECK-NEXT: ldr p15, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: .seh_save_preg p15, 11
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: addvl sp, sp, #18
; CHECK-NEXT: .seh_allocz 18
; CHECK-NEXT: add sp, sp, #16
@@ -1024,10 +1010,8 @@ define void @f6(<vscale x 2 x i64> %x, [8 x i64] %pad, i64 %n9) personality ptr
; CHECK-NEXT: .seh_save_fplr 16
; CHECK-NEXT: ldr x28, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x28, 8
-; CHECK-NEXT: ldr x19, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x19, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x19, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x19, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1144,10 +1128,8 @@ define void @f8(<vscale x 2 x i64> %v) {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: .seh_startepilogue
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1196,14 +1178,10 @@ define void @f9(<vscale x 2 x i64> %v, ...) {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: .seh_startepilogue
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
-; CHECK-NEXT: add sp, sp, #64
-; CHECK-NEXT: .seh_stackalloc 64
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .seh_allocz 1
; CHECK-NEXT: add sp, sp, #64
@@ -1301,10 +1279,8 @@ define void @f10(i64 %n, <vscale x 2 x i64> %x) "frame-pointer"="all" {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_fplr 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 32
; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 2
; CHECK-NEXT: ldr z9, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1390,10 +1366,8 @@ define i32 @f11(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: str d0, [sp, #8]
; CHECK-NEXT: .seh_startepilogue
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1431,10 +1405,8 @@ define i32 @f12(double %d, <vscale x 4 x i32> %vs) "aarch64_pstate_sm_compatible
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .seh_allocz 1
-; CHECK-NEXT: ldr x30, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x30, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1475,10 +1447,8 @@ define i32 @f13(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" {
; CHECK-NEXT: .seh_startepilogue
; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_fplr 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 32
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1521,10 +1491,8 @@ define i32 @f14(double %d, <vscale x 4 x i32> %vs) "frame-pointer"="all" {
; CHECK-NEXT: .seh_allocz 1
; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_fplr 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #32
-; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr x28, [sp], #32 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 32
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1572,10 +1540,8 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) {
; CHECK-NEXT: .seh_stackalloc 16
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
; CHECK-NEXT: .seh_save_reg x30, 8
-; CHECK-NEXT: ldr x28, [sp] // 8-byte Folded Reload
-; CHECK-NEXT: .seh_save_reg x28, 0
-; CHECK-NEXT: add sp, sp, #16
-; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldr x28, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x28, 16
; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT: .seh_save_zreg z8, 0
; CHECK-NEXT: addvl sp, sp, #1
@@ -1594,3 +1560,53 @@ define tailcc void @f15(double %d, <vscale x 4 x i32> %vs, [9 x i64], i32 %i) {
store i32 %i, ptr %a
ret void
}
+
+declare ptr @llvm.swift.async.context.addr()
+
+define void @f16(ptr swiftasync %ctx, <vscale x 2 x i64> %foo) {
+; CHECK-LABEL: f16:
+; CHECK: .seh_proc f16
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: orr x29, x29, #0x1000000000000000
+; CHECK-NEXT: .seh_nop
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .seh_allocz 1
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_zreg z8, 0
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: stp x29, x30, [sp, #8] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 8
+; CHECK-NEXT: str x22, [sp]
+; CHECK-NEXT: .seh_nop
+; CHECK-NEXT: add x29, sp, #8
+; CHECK-NEXT: .seh_add_fp 8
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: ldr x8, [x22]
+; CHECK-NEXT: stur x8, [x29, #-8]
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: .seh_stackalloc 16
+; CHECK-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 8
+; CHECK-NEXT: add sp, sp, #32
+; CHECK-NEXT: .seh_stackalloc 32
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_zreg z8, 0
+; CHECK-NEXT: and x29, x29, #0xefffffffffffffff
+; CHECK-NEXT: .seh_nop
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: .seh_allocz 1
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+ tail call void asm sideeffect "", "~{z8}"()
+ %1 = load ptr, ptr %ctx, align 8
+ %2 = tail call ptr @llvm.swift.async.context.addr()
+ store ptr %1, ptr %2, align 8
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 549af87..a43bfb5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -1047,7 +1047,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
; CI-NEXT: s_cbranch_vccz .LBB9_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else20
; CI-NEXT: s_and_b32 s2, s0, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -1058,7 +1058,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB9_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute19
; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
@@ -1083,10 +1083,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB9_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CI-NEXT: .LBB9_5: ; %frem.loop_body
+; CI-NEXT: .LBB9_5: ; %frem.loop_body27
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v5, v4
; CI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -1102,7 +1102,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB9_7
; CI-NEXT: .LBB9_6:
; CI-NEXT: v_mov_b32_e32 v5, v4
-; CI-NEXT: .LBB9_7: ; %frem.loop_exit
+; CI-NEXT: .LBB9_7: ; %frem.loop_exit28
; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
; CI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -1125,7 +1125,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
; CI-NEXT: s_cbranch_vccz .LBB9_10
-; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: s_and_b32 s4, s2, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -1136,7 +1136,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s4, s4, 1
; CI-NEXT: s_cmp_lg_u32 s4, 0
; CI-NEXT: s_cbranch_scc1 .LBB9_16
-; CI-NEXT: ; %bb.11: ; %frem.compute19
+; CI-NEXT: ; %bb.11: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e32 v4, v2
; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1
@@ -1161,10 +1161,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB9_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6
; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
-; CI-NEXT: .LBB9_13: ; %frem.loop_body27
+; CI-NEXT: .LBB9_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v6, v5
; CI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -1180,7 +1180,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB9_15
; CI-NEXT: .LBB9_14:
; CI-NEXT: v_mov_b32_e32 v6, v5
-; CI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; CI-NEXT: .LBB9_15: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3
; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
; CI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -1237,7 +1237,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
; VI-NEXT: s_cbranch_vccz .LBB9_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else20
; VI-NEXT: s_and_b32 s2, s0, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -1248,7 +1248,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB9_8
-; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: ; %bb.3: ; %frem.compute19
; VI-NEXT: v_frexp_mant_f32_e32 v3, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
; VI-NEXT: v_ldexp_f32 v1, v3, 1
@@ -1273,10 +1273,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB9_6
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
-; VI-NEXT: .LBB9_5: ; %frem.loop_body
+; VI-NEXT: .LBB9_5: ; %frem.loop_body27
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v5, v4
; VI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -1292,7 +1292,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB9_7
; VI-NEXT: .LBB9_6:
; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: .LBB9_7: ; %frem.loop_exit
+; VI-NEXT: .LBB9_7: ; %frem.loop_exit28
; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
; VI-NEXT: v_ldexp_f32 v2, v5, v2
; VI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -1315,7 +1315,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
; VI-NEXT: s_cbranch_vccz .LBB9_10
-; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: s_and_b32 s3, s4, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1326,7 +1326,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s3, s3, 1
; VI-NEXT: s_cmp_lg_u32 s3, 0
; VI-NEXT: s_cbranch_scc1 .LBB9_16
-; VI-NEXT: ; %bb.11: ; %frem.compute19
+; VI-NEXT: ; %bb.11: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e32 v4, v2
; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
; VI-NEXT: v_ldexp_f32 v2, v4, 1
@@ -1351,10 +1351,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB9_14
-; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
-; VI-NEXT: .LBB9_13: ; %frem.loop_body27
+; VI-NEXT: .LBB9_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -1370,7 +1370,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB9_15
; VI-NEXT: .LBB9_14:
; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; VI-NEXT: .LBB9_15: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3
; VI-NEXT: v_ldexp_f32 v3, v6, v3
; VI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -1425,7 +1425,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v1, |s2|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
; CI-NEXT: s_cbranch_vccz .LBB10_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else86
; CI-NEXT: s_and_b32 s0, s4, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1436,7 +1436,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s0, s0, 1
; CI-NEXT: s_cmp_lg_u32 s0, 0
; CI-NEXT: s_cbranch_scc1 .LBB10_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute85
; CI-NEXT: v_frexp_mant_f32_e32 v3, v1
; CI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
; CI-NEXT: v_ldexp_f32_e64 v1, v3, 1
@@ -1461,10 +1461,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; CI-NEXT: v_add_i32_e32 v2, vcc, 11, v5
; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CI-NEXT: .LBB10_5: ; %frem.loop_body
+; CI-NEXT: .LBB10_5: ; %frem.loop_body93
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v5, v4
; CI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -1480,7 +1480,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_7
; CI-NEXT: .LBB10_6:
; CI-NEXT: v_mov_b32_e32 v5, v4
-; CI-NEXT: .LBB10_7: ; %frem.loop_exit
+; CI-NEXT: .LBB10_7: ; %frem.loop_exit94
; CI-NEXT: v_add_i32_e32 v2, vcc, -10, v2
; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
; CI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -1503,7 +1503,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
; CI-NEXT: s_cbranch_vccz .LBB10_10
-; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: ; %bb.9: ; %frem.else53
; CI-NEXT: s_and_b32 s1, s6, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1514,7 +1514,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s1, s1, 1
; CI-NEXT: s_cmp_lg_u32 s1, 0
; CI-NEXT: s_cbranch_scc1 .LBB10_16
-; CI-NEXT: ; %bb.11: ; %frem.compute19
+; CI-NEXT: ; %bb.11: ; %frem.compute52
; CI-NEXT: v_frexp_mant_f32_e32 v4, v2
; CI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
; CI-NEXT: v_ldexp_f32_e64 v2, v4, 1
@@ -1539,10 +1539,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; CI-NEXT: v_add_i32_e32 v3, vcc, 11, v6
; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
-; CI-NEXT: .LBB10_13: ; %frem.loop_body27
+; CI-NEXT: .LBB10_13: ; %frem.loop_body60
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v6, v5
; CI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -1558,7 +1558,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_15
; CI-NEXT: .LBB10_14:
; CI-NEXT: v_mov_b32_e32 v6, v5
-; CI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; CI-NEXT: .LBB10_15: ; %frem.loop_exit61
; CI-NEXT: v_add_i32_e32 v3, vcc, -10, v3
; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
; CI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -1579,7 +1579,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr2
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
; CI-NEXT: s_cbranch_vccz .LBB10_18
-; CI-NEXT: ; %bb.17: ; %frem.else53
+; CI-NEXT: ; %bb.17: ; %frem.else20
; CI-NEXT: s_and_b32 s1, s5, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
; CI-NEXT: v_mov_b32_e32 v2, s1
@@ -1590,7 +1590,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s1, s1, 1
; CI-NEXT: s_cmp_lg_u32 s1, 0
; CI-NEXT: s_cbranch_scc1 .LBB10_24
-; CI-NEXT: ; %bb.19: ; %frem.compute52
+; CI-NEXT: ; %bb.19: ; %frem.compute19
; CI-NEXT: v_frexp_mant_f32_e32 v5, v3
; CI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3
; CI-NEXT: v_ldexp_f32_e64 v3, v5, 1
@@ -1615,10 +1615,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4
; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_22
-; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; CI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; CI-NEXT: v_add_i32_e32 v4, vcc, 11, v7
; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
-; CI-NEXT: .LBB10_21: ; %frem.loop_body60
+; CI-NEXT: .LBB10_21: ; %frem.loop_body27
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v7, v6
; CI-NEXT: v_mul_f32_e32 v6, v7, v5
@@ -1634,7 +1634,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_23
; CI-NEXT: .LBB10_22:
; CI-NEXT: v_mov_b32_e32 v7, v6
-; CI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; CI-NEXT: .LBB10_23: ; %frem.loop_exit28
; CI-NEXT: v_add_i32_e32 v4, vcc, -10, v4
; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4
; CI-NEXT: v_mul_f32_e32 v5, v4, v5
@@ -1657,7 +1657,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; implicit-def: $vgpr3
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4
; CI-NEXT: s_cbranch_vccz .LBB10_26
-; CI-NEXT: ; %bb.25: ; %frem.else86
+; CI-NEXT: ; %bb.25: ; %frem.else
; CI-NEXT: s_and_b32 s1, s7, 0x8000
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1668,7 +1668,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s1, s1, 1
; CI-NEXT: s_cmp_lg_u32 s1, 0
; CI-NEXT: s_cbranch_scc1 .LBB10_32
-; CI-NEXT: ; %bb.27: ; %frem.compute85
+; CI-NEXT: ; %bb.27: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e32 v6, v4
; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4
; CI-NEXT: v_ldexp_f32_e64 v4, v6, 1
@@ -1693,10 +1693,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5
; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_30
-; CI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; CI-NEXT: v_add_i32_e32 v5, vcc, 11, v8
; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
-; CI-NEXT: .LBB10_29: ; %frem.loop_body93
+; CI-NEXT: .LBB10_29: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v8, v7
; CI-NEXT: v_mul_f32_e32 v7, v8, v6
@@ -1712,7 +1712,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_31
; CI-NEXT: .LBB10_30:
; CI-NEXT: v_mov_b32_e32 v8, v7
-; CI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; CI-NEXT: .LBB10_31: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v5, vcc, -10, v5
; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5
; CI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -1791,7 +1791,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v1, |s6|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v2, v1
; VI-NEXT: s_cbranch_vccz .LBB10_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else86
; VI-NEXT: s_and_b32 s0, s8, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v2, v1
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1802,7 +1802,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s0, s0, 1
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc1 .LBB10_8
-; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: ; %bb.3: ; %frem.compute85
; VI-NEXT: v_frexp_mant_f32_e32 v3, v1
; VI-NEXT: v_frexp_exp_i32_f32_e32 v6, v1
; VI-NEXT: v_ldexp_f32 v1, v3, 1
@@ -1827,10 +1827,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v2
; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_6
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; VI-NEXT: v_add_u32_e32 v2, vcc, 11, v5
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
-; VI-NEXT: .LBB10_5: ; %frem.loop_body
+; VI-NEXT: .LBB10_5: ; %frem.loop_body93
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v5, v4
; VI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -1846,7 +1846,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_7
; VI-NEXT: .LBB10_6:
; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: .LBB10_7: ; %frem.loop_exit
+; VI-NEXT: .LBB10_7: ; %frem.loop_exit94
; VI-NEXT: v_add_u32_e32 v2, vcc, -10, v2
; VI-NEXT: v_ldexp_f32 v2, v5, v2
; VI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -1869,7 +1869,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v3, v2
; VI-NEXT: s_cbranch_vccz .LBB10_10
-; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: ; %bb.9: ; %frem.else53
; VI-NEXT: s_and_b32 s0, s4, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v3, v2
; VI-NEXT: v_mov_b32_e32 v1, s0
@@ -1880,7 +1880,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s0, s0, 1
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc1 .LBB10_16
-; VI-NEXT: ; %bb.11: ; %frem.compute19
+; VI-NEXT: ; %bb.11: ; %frem.compute52
; VI-NEXT: v_frexp_mant_f32_e32 v4, v2
; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v2
; VI-NEXT: v_ldexp_f32 v2, v4, 1
@@ -1905,10 +1905,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v3
; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_14
-; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; VI-NEXT: v_add_u32_e32 v3, vcc, 11, v6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
-; VI-NEXT: .LBB10_13: ; %frem.loop_body27
+; VI-NEXT: .LBB10_13: ; %frem.loop_body60
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -1924,7 +1924,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_15
; VI-NEXT: .LBB10_14:
; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; VI-NEXT: .LBB10_15: ; %frem.loop_exit61
; VI-NEXT: v_add_u32_e32 v3, vcc, -10, v3
; VI-NEXT: v_ldexp_f32 v3, v6, v3
; VI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -1945,7 +1945,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; implicit-def: $vgpr2
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
; VI-NEXT: s_cbranch_vccz .LBB10_18
-; VI-NEXT: ; %bb.17: ; %frem.else53
+; VI-NEXT: ; %bb.17: ; %frem.else20
; VI-NEXT: s_and_b32 s0, s9, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -1956,7 +1956,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s0, s0, 1
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc1 .LBB10_24
-; VI-NEXT: ; %bb.19: ; %frem.compute52
+; VI-NEXT: ; %bb.19: ; %frem.compute19
; VI-NEXT: v_frexp_mant_f32_e32 v5, v3
; VI-NEXT: v_frexp_exp_i32_f32_e32 v8, v3
; VI-NEXT: v_ldexp_f32 v3, v5, 1
@@ -1981,10 +1981,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v4
; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_22
-; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; VI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v7
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8
-; VI-NEXT: .LBB10_21: ; %frem.loop_body60
+; VI-NEXT: .LBB10_21: ; %frem.loop_body27
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v7, v6
; VI-NEXT: v_mul_f32_e32 v6, v7, v5
@@ -2000,7 +2000,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_23
; VI-NEXT: .LBB10_22:
; VI-NEXT: v_mov_b32_e32 v7, v6
-; VI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; VI-NEXT: .LBB10_23: ; %frem.loop_exit28
; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4
; VI-NEXT: v_ldexp_f32 v4, v7, v4
; VI-NEXT: v_mul_f32_e32 v5, v4, v5
@@ -2023,7 +2023,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; implicit-def: $vgpr3
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v5, v4
; VI-NEXT: s_cbranch_vccz .LBB10_26
-; VI-NEXT: ; %bb.25: ; %frem.else86
+; VI-NEXT: ; %bb.25: ; %frem.else
; VI-NEXT: s_and_b32 s0, s12, 0x8000
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v4
; VI-NEXT: v_mov_b32_e32 v3, s0
@@ -2034,7 +2034,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s0, s0, 1
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cbranch_scc1 .LBB10_32
-; VI-NEXT: ; %bb.27: ; %frem.compute85
+; VI-NEXT: ; %bb.27: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e32 v6, v4
; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v4
; VI-NEXT: v_ldexp_f32 v4, v6, 1
@@ -2059,10 +2059,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 11, v5
; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_30
-; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; VI-NEXT: v_add_u32_e32 v5, vcc, 11, v8
; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9
-; VI-NEXT: .LBB10_29: ; %frem.loop_body93
+; VI-NEXT: .LBB10_29: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mul_f32_e32 v7, v8, v6
@@ -2078,7 +2078,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_31
; VI-NEXT: .LBB10_30:
; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; VI-NEXT: .LBB10_31: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v5, vcc, -10, v5
; VI-NEXT: v_ldexp_f32 v5, v8, v5
; VI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -2144,7 +2144,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
; CI-NEXT: ; implicit-def: $vgpr0
; CI-NEXT: s_cbranch_vccz .LBB11_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else16
; CI-NEXT: s_and_b32 s6, s2, 0x80000000
; CI-NEXT: v_mov_b32_e32 v1, s4
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -2156,7 +2156,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s6, s6, 1
; CI-NEXT: s_cmp_lg_u32 s6, 0
; CI-NEXT: s_cbranch_scc1 .LBB11_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f32_e64 v1, |s4|
; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
; CI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
@@ -2181,10 +2181,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB11_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CI-NEXT: .LBB11_5: ; %frem.loop_body
+; CI-NEXT: .LBB11_5: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v5, v4
; CI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -2200,7 +2200,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB11_7
; CI-NEXT: .LBB11_6:
; CI-NEXT: v_mov_b32_e32 v5, v4
-; CI-NEXT: .LBB11_7: ; %frem.loop_exit
+; CI-NEXT: .LBB11_7: ; %frem.loop_exit24
; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
; CI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -2219,7 +2219,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_mov_b32 s6, 1
; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: s_cbranch_vccz .LBB11_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: s_and_b32 s6, s3, 0x80000000
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2231,7 +2231,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s6, s6, 1
; CI-NEXT: s_cmp_lg_u32 s6, 0
; CI-NEXT: s_cbranch_scc1 .LBB11_16
-; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: ; %bb.11: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e64 v2, |s5|
; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1
; CI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
@@ -2256,10 +2256,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB11_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6
; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
-; CI-NEXT: .LBB11_13: ; %frem.loop_body23
+; CI-NEXT: .LBB11_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v6, v5
; CI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -2275,7 +2275,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB11_15
; CI-NEXT: .LBB11_14:
; CI-NEXT: v_mov_b32_e32 v6, v5
-; CI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB11_15: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
; CI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -2317,7 +2317,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s2|, |v0|
; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: s_cbranch_vccz .LBB11_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else16
; VI-NEXT: s_and_b32 s6, s2, 0x80000000
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -2329,7 +2329,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s6, s6, 1
; VI-NEXT: s_cmp_lg_u32 s6, 0
; VI-NEXT: s_cbranch_scc1 .LBB11_8
-; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: ; %bb.3: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f32_e64 v1, |s4|
; VI-NEXT: v_ldexp_f32 v1, v1, 1
; VI-NEXT: v_div_scale_f32 v3, s[6:7], v1, v1, 1.0
@@ -2354,10 +2354,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB11_6
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
-; VI-NEXT: .LBB11_5: ; %frem.loop_body
+; VI-NEXT: .LBB11_5: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v5, v4
; VI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -2373,7 +2373,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB11_7
; VI-NEXT: .LBB11_6:
; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: .LBB11_7: ; %frem.loop_exit
+; VI-NEXT: .LBB11_7: ; %frem.loop_exit24
; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
; VI-NEXT: v_ldexp_f32 v2, v5, v2
; VI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -2392,7 +2392,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s6, 1
; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: s_cbranch_vccz .LBB11_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: s_and_b32 s6, s3, 0x80000000
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2404,7 +2404,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s6, s6, 1
; VI-NEXT: s_cmp_lg_u32 s6, 0
; VI-NEXT: s_cbranch_scc1 .LBB11_16
-; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: ; %bb.11: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e64 v2, |s5|
; VI-NEXT: v_ldexp_f32 v2, v2, 1
; VI-NEXT: v_div_scale_f32 v4, s[6:7], v2, v2, 1.0
@@ -2429,10 +2429,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB11_14
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
-; VI-NEXT: .LBB11_13: ; %frem.loop_body23
+; VI-NEXT: .LBB11_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -2448,7 +2448,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB11_15
; VI-NEXT: .LBB11_14:
; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB11_15: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
; VI-NEXT: v_ldexp_f32 v3, v6, v3
; VI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -2498,7 +2498,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
; CI-NEXT: ; implicit-def: $vgpr0
; CI-NEXT: s_cbranch_vccz .LBB12_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else78
; CI-NEXT: s_and_b32 s2, s4, 0x80000000
; CI-NEXT: v_mov_b32_e32 v1, s8
; CI-NEXT: v_mov_b32_e32 v0, s4
@@ -2510,7 +2510,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB12_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute77
; CI-NEXT: v_frexp_mant_f32_e64 v1, |s8|
; CI-NEXT: v_ldexp_f32_e64 v1, v1, 1
; CI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
@@ -2535,10 +2535,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
; CI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; CI-NEXT: v_add_i32_e32 v2, vcc, 12, v5
; CI-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
-; CI-NEXT: .LBB12_5: ; %frem.loop_body
+; CI-NEXT: .LBB12_5: ; %frem.loop_body85
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v5, v4
; CI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -2554,7 +2554,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_7
; CI-NEXT: .LBB12_6:
; CI-NEXT: v_mov_b32_e32 v5, v4
-; CI-NEXT: .LBB12_7: ; %frem.loop_exit
+; CI-NEXT: .LBB12_7: ; %frem.loop_exit86
; CI-NEXT: v_add_i32_e32 v2, vcc, -11, v2
; CI-NEXT: v_ldexp_f32_e32 v2, v5, v2
; CI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -2573,7 +2573,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: ; implicit-def: $vgpr1
; CI-NEXT: s_cbranch_vccz .LBB12_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else47
; CI-NEXT: s_and_b32 s2, s5, 0x80000000
; CI-NEXT: v_mov_b32_e32 v2, s9
; CI-NEXT: v_mov_b32_e32 v1, s5
@@ -2585,7 +2585,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB12_16
-; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: ; %bb.11: ; %frem.compute46
; CI-NEXT: v_frexp_mant_f32_e64 v2, |s9|
; CI-NEXT: v_ldexp_f32_e64 v2, v2, 1
; CI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
@@ -2610,10 +2610,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
; CI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; CI-NEXT: v_add_i32_e32 v3, vcc, 12, v6
; CI-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
-; CI-NEXT: .LBB12_13: ; %frem.loop_body23
+; CI-NEXT: .LBB12_13: ; %frem.loop_body54
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v6, v5
; CI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -2629,7 +2629,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_15
; CI-NEXT: .LBB12_14:
; CI-NEXT: v_mov_b32_e32 v6, v5
-; CI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB12_15: ; %frem.loop_exit55
; CI-NEXT: v_add_i32_e32 v3, vcc, -11, v3
; CI-NEXT: v_ldexp_f32_e32 v3, v6, v3
; CI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -2648,7 +2648,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: ; implicit-def: $vgpr2
; CI-NEXT: s_cbranch_vccz .LBB12_18
-; CI-NEXT: ; %bb.17: ; %frem.else47
+; CI-NEXT: ; %bb.17: ; %frem.else16
; CI-NEXT: s_and_b32 s2, s6, 0x80000000
; CI-NEXT: v_mov_b32_e32 v3, s10
; CI-NEXT: v_mov_b32_e32 v2, s6
@@ -2660,7 +2660,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB12_24
-; CI-NEXT: ; %bb.19: ; %frem.compute46
+; CI-NEXT: ; %bb.19: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f32_e64 v3, |s10|
; CI-NEXT: v_ldexp_f32_e64 v3, v3, 1
; CI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
@@ -2685,10 +2685,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4
; CI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_22
-; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; CI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; CI-NEXT: v_add_i32_e32 v4, vcc, 12, v7
; CI-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
-; CI-NEXT: .LBB12_21: ; %frem.loop_body54
+; CI-NEXT: .LBB12_21: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v7, v6
; CI-NEXT: v_mul_f32_e32 v6, v7, v5
@@ -2704,7 +2704,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_23
; CI-NEXT: .LBB12_22:
; CI-NEXT: v_mov_b32_e32 v7, v6
-; CI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; CI-NEXT: .LBB12_23: ; %frem.loop_exit24
; CI-NEXT: v_add_i32_e32 v4, vcc, -11, v4
; CI-NEXT: v_ldexp_f32_e32 v4, v7, v4
; CI-NEXT: v_mul_f32_e32 v5, v4, v5
@@ -2723,7 +2723,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: ; implicit-def: $vgpr3
; CI-NEXT: s_cbranch_vccz .LBB12_26
-; CI-NEXT: ; %bb.25: ; %frem.else78
+; CI-NEXT: ; %bb.25: ; %frem.else
; CI-NEXT: s_and_b32 s2, s7, 0x80000000
; CI-NEXT: v_mov_b32_e32 v4, s11
; CI-NEXT: v_mov_b32_e32 v3, s7
@@ -2735,7 +2735,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB12_32
-; CI-NEXT: ; %bb.27: ; %frem.compute77
+; CI-NEXT: ; %bb.27: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e64 v4, |s11|
; CI-NEXT: v_ldexp_f32_e64 v4, v4, 1
; CI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
@@ -2760,10 +2760,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5
; CI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_30
-; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; CI-NEXT: v_add_i32_e32 v5, vcc, 12, v8
; CI-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
-; CI-NEXT: .LBB12_29: ; %frem.loop_body85
+; CI-NEXT: .LBB12_29: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v8, v7
; CI-NEXT: v_mul_f32_e32 v7, v8, v6
@@ -2779,7 +2779,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_31
; CI-NEXT: .LBB12_30:
; CI-NEXT: v_mov_b32_e32 v8, v7
-; CI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; CI-NEXT: .LBB12_31: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v5, vcc, -11, v5
; CI-NEXT: v_ldexp_f32_e32 v5, v8, v5
; CI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -2829,7 +2829,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 vcc, |s4|, |v0|
; VI-NEXT: ; implicit-def: $vgpr0
; VI-NEXT: s_cbranch_vccz .LBB12_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else78
; VI-NEXT: s_and_b32 s2, s4, 0x80000000
; VI-NEXT: v_mov_b32_e32 v1, s8
; VI-NEXT: v_mov_b32_e32 v0, s4
@@ -2841,7 +2841,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB12_8
-; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: ; %bb.3: ; %frem.compute77
; VI-NEXT: v_frexp_mant_f32_e64 v1, |s8|
; VI-NEXT: v_ldexp_f32 v1, v1, 1
; VI-NEXT: v_div_scale_f32 v3, s[2:3], v1, v1, 1.0
@@ -2866,10 +2866,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v2
; VI-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_6
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; VI-NEXT: v_add_u32_e32 v2, vcc, 12, v5
; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v6
-; VI-NEXT: .LBB12_5: ; %frem.loop_body
+; VI-NEXT: .LBB12_5: ; %frem.loop_body85
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v5, v4
; VI-NEXT: v_mul_f32_e32 v4, v5, v3
@@ -2885,7 +2885,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_7
; VI-NEXT: .LBB12_6:
; VI-NEXT: v_mov_b32_e32 v5, v4
-; VI-NEXT: .LBB12_7: ; %frem.loop_exit
+; VI-NEXT: .LBB12_7: ; %frem.loop_exit86
; VI-NEXT: v_add_u32_e32 v2, vcc, -11, v2
; VI-NEXT: v_ldexp_f32 v2, v5, v2
; VI-NEXT: v_mul_f32_e32 v3, v2, v3
@@ -2904,7 +2904,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: ; implicit-def: $vgpr1
; VI-NEXT: s_cbranch_vccz .LBB12_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else47
; VI-NEXT: s_and_b32 s2, s5, 0x80000000
; VI-NEXT: v_mov_b32_e32 v2, s9
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -2916,7 +2916,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB12_16
-; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: ; %bb.11: ; %frem.compute46
; VI-NEXT: v_frexp_mant_f32_e64 v2, |s9|
; VI-NEXT: v_ldexp_f32 v2, v2, 1
; VI-NEXT: v_div_scale_f32 v4, s[2:3], v2, v2, 1.0
@@ -2941,10 +2941,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v3
; VI-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_14
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; VI-NEXT: v_add_u32_e32 v3, vcc, 12, v6
; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v7
-; VI-NEXT: .LBB12_13: ; %frem.loop_body23
+; VI-NEXT: .LBB12_13: ; %frem.loop_body54
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v6, v5
; VI-NEXT: v_mul_f32_e32 v5, v6, v4
@@ -2960,7 +2960,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_15
; VI-NEXT: .LBB12_14:
; VI-NEXT: v_mov_b32_e32 v6, v5
-; VI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB12_15: ; %frem.loop_exit55
; VI-NEXT: v_add_u32_e32 v3, vcc, -11, v3
; VI-NEXT: v_ldexp_f32 v3, v6, v3
; VI-NEXT: v_mul_f32_e32 v4, v3, v4
@@ -2979,7 +2979,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: ; implicit-def: $vgpr2
; VI-NEXT: s_cbranch_vccz .LBB12_18
-; VI-NEXT: ; %bb.17: ; %frem.else47
+; VI-NEXT: ; %bb.17: ; %frem.else16
; VI-NEXT: s_and_b32 s2, s6, 0x80000000
; VI-NEXT: v_mov_b32_e32 v3, s10
; VI-NEXT: v_mov_b32_e32 v2, s6
@@ -2991,7 +2991,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB12_24
-; VI-NEXT: ; %bb.19: ; %frem.compute46
+; VI-NEXT: ; %bb.19: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f32_e64 v3, |s10|
; VI-NEXT: v_ldexp_f32 v3, v3, 1
; VI-NEXT: v_div_scale_f32 v5, s[2:3], v3, v3, 1.0
@@ -3016,10 +3016,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v4
; VI-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_22
-; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; VI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; VI-NEXT: v_add_u32_e32 v4, vcc, 12, v7
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v8
-; VI-NEXT: .LBB12_21: ; %frem.loop_body54
+; VI-NEXT: .LBB12_21: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v7, v6
; VI-NEXT: v_mul_f32_e32 v6, v7, v5
@@ -3035,7 +3035,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_23
; VI-NEXT: .LBB12_22:
; VI-NEXT: v_mov_b32_e32 v7, v6
-; VI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; VI-NEXT: .LBB12_23: ; %frem.loop_exit24
; VI-NEXT: v_add_u32_e32 v4, vcc, -11, v4
; VI-NEXT: v_ldexp_f32 v4, v7, v4
; VI-NEXT: v_mul_f32_e32 v5, v4, v5
@@ -3054,7 +3054,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: ; implicit-def: $vgpr3
; VI-NEXT: s_cbranch_vccz .LBB12_26
-; VI-NEXT: ; %bb.25: ; %frem.else78
+; VI-NEXT: ; %bb.25: ; %frem.else
; VI-NEXT: s_and_b32 s2, s7, 0x80000000
; VI-NEXT: v_mov_b32_e32 v4, s11
; VI-NEXT: v_mov_b32_e32 v3, s7
@@ -3066,7 +3066,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB12_32
-; VI-NEXT: ; %bb.27: ; %frem.compute77
+; VI-NEXT: ; %bb.27: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e64 v4, |s11|
; VI-NEXT: v_ldexp_f32 v4, v4, 1
; VI-NEXT: v_div_scale_f32 v6, s[2:3], v4, v4, 1.0
@@ -3091,10 +3091,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 12, v5
; VI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_30
-; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; VI-NEXT: v_add_u32_e32 v5, vcc, 12, v8
; VI-NEXT: v_sub_u32_e32 v5, vcc, v5, v9
-; VI-NEXT: .LBB12_29: ; %frem.loop_body85
+; VI-NEXT: .LBB12_29: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v8, v7
; VI-NEXT: v_mul_f32_e32 v7, v8, v6
@@ -3110,7 +3110,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_31
; VI-NEXT: .LBB12_30:
; VI-NEXT: v_mov_b32_e32 v8, v7
-; VI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; VI-NEXT: .LBB12_31: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v5, vcc, -11, v5
; VI-NEXT: v_ldexp_f32 v5, v8, v5
; VI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -3169,7 +3169,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
; CI-NEXT: ; implicit-def: $vgpr0_vgpr1
; CI-NEXT: s_cbranch_vccz .LBB13_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else16
; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: v_mov_b32_e32 v1, s9
; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
@@ -3187,7 +3187,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB13_8
-; CI-NEXT: ; %bb.3: ; %frem.compute
+; CI-NEXT: ; %bb.3: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
; CI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
; CI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
@@ -3210,10 +3210,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
; CI-NEXT: s_cbranch_vccnz .LBB13_6
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; CI-NEXT: v_add_i32_e32 v6, vcc, 26, v6
; CI-NEXT: v_sub_i32_e32 v9, vcc, v6, v7
-; CI-NEXT: .LBB13_5: ; %frem.loop_body
+; CI-NEXT: .LBB13_5: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v7, v5
; CI-NEXT: v_mov_b32_e32 v6, v4
@@ -3232,7 +3232,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: .LBB13_6:
; CI-NEXT: v_mov_b32_e32 v7, v5
; CI-NEXT: v_mov_b32_e32 v6, v4
-; CI-NEXT: .LBB13_7: ; %frem.loop_exit
+; CI-NEXT: .LBB13_7: ; %frem.loop_exit24
; CI-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe7, v9
; CI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
; CI-NEXT: s_mov_b32 s2, 0
@@ -3256,7 +3256,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_mov_b32 s2, 1
; CI-NEXT: ; implicit-def: $vgpr2_vgpr3
; CI-NEXT: s_cbranch_vccz .LBB13_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: v_mov_b32_e32 v2, s10
; CI-NEXT: v_mov_b32_e32 v3, s11
; CI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
@@ -3274,7 +3274,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_xor_b32 s2, s2, 1
; CI-NEXT: s_cmp_lg_u32 s2, 0
; CI-NEXT: s_cbranch_scc1 .LBB13_16
-; CI-NEXT: ; %bb.11: ; %frem.compute15
+; CI-NEXT: ; %bb.11: ; %frem.compute
; CI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
; CI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
; CI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
@@ -3297,10 +3297,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11
; CI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0
; CI-NEXT: s_cbranch_vccnz .LBB13_14
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_add_i32_e32 v8, vcc, 26, v8
; CI-NEXT: v_sub_i32_e32 v11, vcc, v8, v9
-; CI-NEXT: .LBB13_13: ; %frem.loop_body23
+; CI-NEXT: .LBB13_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v9, v7
; CI-NEXT: v_mov_b32_e32 v8, v6
@@ -3319,7 +3319,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: .LBB13_14:
; CI-NEXT: v_mov_b32_e32 v9, v7
; CI-NEXT: v_mov_b32_e32 v8, v6
-; CI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB13_15: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v6, vcc, 0xffffffe7, v11
; CI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6
; CI-NEXT: s_mov_b32 s2, 0
@@ -3371,7 +3371,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f64_e64 vcc, |s[4:5]|, |v[0:1]|
; VI-NEXT: ; implicit-def: $vgpr0_vgpr1
; VI-NEXT: s_cbranch_vccz .LBB13_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else16
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[4:5]|, |v[0:1]|
@@ -3389,7 +3389,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB13_8
-; VI-NEXT: ; %bb.3: ; %frem.compute
+; VI-NEXT: ; %bb.3: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f64_e64 v[0:1], |s[4:5]|
; VI-NEXT: v_frexp_exp_i32_f64_e64 v6, |s[4:5]|
; VI-NEXT: v_frexp_exp_i32_f64_e64 v7, |s[8:9]|
@@ -3412,10 +3412,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v9
; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], 1.0
; VI-NEXT: s_cbranch_vccnz .LBB13_6
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; VI-NEXT: v_add_u32_e32 v6, vcc, 26, v6
; VI-NEXT: v_sub_u32_e32 v9, vcc, v6, v7
-; VI-NEXT: .LBB13_5: ; %frem.loop_body
+; VI-NEXT: .LBB13_5: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v7, v5
; VI-NEXT: v_mov_b32_e32 v6, v4
@@ -3434,7 +3434,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: .LBB13_6:
; VI-NEXT: v_mov_b32_e32 v7, v5
; VI-NEXT: v_mov_b32_e32 v6, v4
-; VI-NEXT: .LBB13_7: ; %frem.loop_exit
+; VI-NEXT: .LBB13_7: ; %frem.loop_exit24
; VI-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe7, v9
; VI-NEXT: v_ldexp_f64 v[4:5], v[6:7], v4
; VI-NEXT: s_mov_b32 s2, 0
@@ -3458,7 +3458,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_mov_b32 s2, 1
; VI-NEXT: ; implicit-def: $vgpr2_vgpr3
; VI-NEXT: s_cbranch_vccz .LBB13_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: v_cmp_eq_f64_e64 vcc, |s[6:7]|, |v[2:3]|
@@ -3476,7 +3476,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_xor_b32 s2, s2, 1
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_cbranch_scc1 .LBB13_16
-; VI-NEXT: ; %bb.11: ; %frem.compute15
+; VI-NEXT: ; %bb.11: ; %frem.compute
; VI-NEXT: v_frexp_mant_f64_e64 v[2:3], |s[6:7]|
; VI-NEXT: v_frexp_exp_i32_f64_e64 v8, |s[6:7]|
; VI-NEXT: v_frexp_exp_i32_f64_e64 v9, |s[10:11]|
@@ -3499,10 +3499,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ge_i32_e32 vcc, 26, v11
; VI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], 1.0
; VI-NEXT: s_cbranch_vccnz .LBB13_14
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_add_u32_e32 v8, vcc, 26, v8
; VI-NEXT: v_sub_u32_e32 v11, vcc, v8, v9
-; VI-NEXT: .LBB13_13: ; %frem.loop_body23
+; VI-NEXT: .LBB13_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v8, v6
@@ -3521,7 +3521,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: .LBB13_14:
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mov_b32_e32 v8, v6
-; VI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB13_15: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v6, vcc, 0xffffffe7, v11
; VI-NEXT: v_ldexp_f64 v[6:7], v[8:9], v6
; VI-NEXT: s_mov_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
index f96a6f7..b239c46 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll
@@ -1,13 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefix=GCN %s
-; GCN-LABEL: {{^}}kernel_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
+; GCN-LABEL: kernel_ieee_mode_default:
+; GCN: .amd_kernel_code_t
+; GCN-NEXT: amd_code_version_major = 1
+; GCN-NEXT: amd_code_version_minor = 2
+; GCN-NEXT: amd_machine_kind = 1
+; GCN-NEXT: amd_machine_version_major = 6
+; GCN-NEXT: amd_machine_version_minor = 0
+; GCN-NEXT: amd_machine_version_stepping = 0
+; GCN-NEXT: kernel_code_entry_byte_offset = 256
+; GCN-NEXT: kernel_code_prefetch_byte_size = 0
+; GCN-NEXT: granulated_workitem_vgpr_count = 0
+; GCN-NEXT: granulated_wavefront_sgpr_count = 0
+; GCN-NEXT: priority = 0
+; GCN-NEXT: float_mode = 240
+; GCN-NEXT: priv = 0
+; GCN-NEXT: enable_dx10_clamp = 1
+; GCN-NEXT: debug_mode = 0
+; GCN-NEXT: enable_ieee_mode = 1
+; GCN-NEXT: enable_wgp_mode = 0
+; GCN-NEXT: enable_mem_ordered = 0
+; GCN-NEXT: enable_fwd_progress = 0
+; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT: user_sgpr_count = 12
+; GCN-NEXT: enable_trap_handler = 0
+; GCN-NEXT: enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT: enable_sgpr_workgroup_info = 0
+; GCN-NEXT: enable_vgpr_workitem_id = 2
+; GCN-NEXT: enable_exception_msb = 0
+; GCN-NEXT: granulated_lds_size = 0
+; GCN-NEXT: enable_exception = 0
+; GCN-NEXT: enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT: enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT: enable_sgpr_queue_ptr = 1
+; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT: enable_sgpr_dispatch_id = 1
+; GCN-NEXT: enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT: enable_sgpr_private_segment_size = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT: enable_wavefront_size32 = 0
+; GCN-NEXT: enable_ordered_append_gds = 0
+; GCN-NEXT: private_element_size = 1
+; GCN-NEXT: is_ptr64 = 1
+; GCN-NEXT: is_dynamic_callstack = 0
+; GCN-NEXT: is_debug_enabled = 0
+; GCN-NEXT: is_xnack_enabled = 0
+; GCN-NEXT: workitem_private_segment_byte_size = 0
+; GCN-NEXT: workgroup_group_segment_byte_size = 0
+; GCN-NEXT: gds_segment_byte_size = 0
+; GCN-NEXT: kernarg_segment_byte_size = 16
+; GCN-NEXT: workgroup_fbarrier_count = 0
+; GCN-NEXT: wavefront_sgpr_count = 4
+; GCN-NEXT: workitem_vgpr_count = 2
+; GCN-NEXT: reserved_vgpr_first = 0
+; GCN-NEXT: reserved_vgpr_count = 0
+; GCN-NEXT: reserved_sgpr_first = 0
+; GCN-NEXT: reserved_sgpr_count = 0
+; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT: debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT: kernarg_segment_alignment = 4
+; GCN-NEXT: group_segment_alignment = 4
+; GCN-NEXT: private_segment_alignment = 4
+; GCN-NEXT: wavefront_size = 6
+; GCN-NEXT: call_convention = -1
+; GCN-NEXT: runtime_loader_kernel_symbol = 0
+; GCN-NEXT: .end_amd_kernel_code_t
+; GCN-NEXT: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -15,14 +91,89 @@ define amdgpu_kernel void @kernel_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}kernel_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
+; GCN-LABEL: kernel_ieee_mode_on:
+; GCN: .amd_kernel_code_t
+; GCN-NEXT: amd_code_version_major = 1
+; GCN-NEXT: amd_code_version_minor = 2
+; GCN-NEXT: amd_machine_kind = 1
+; GCN-NEXT: amd_machine_version_major = 6
+; GCN-NEXT: amd_machine_version_minor = 0
+; GCN-NEXT: amd_machine_version_stepping = 0
+; GCN-NEXT: kernel_code_entry_byte_offset = 256
+; GCN-NEXT: kernel_code_prefetch_byte_size = 0
+; GCN-NEXT: granulated_workitem_vgpr_count = 0
+; GCN-NEXT: granulated_wavefront_sgpr_count = 0
+; GCN-NEXT: priority = 0
+; GCN-NEXT: float_mode = 240
+; GCN-NEXT: priv = 0
+; GCN-NEXT: enable_dx10_clamp = 1
+; GCN-NEXT: debug_mode = 0
+; GCN-NEXT: enable_ieee_mode = 1
+; GCN-NEXT: enable_wgp_mode = 0
+; GCN-NEXT: enable_mem_ordered = 0
+; GCN-NEXT: enable_fwd_progress = 0
+; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT: user_sgpr_count = 12
+; GCN-NEXT: enable_trap_handler = 0
+; GCN-NEXT: enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT: enable_sgpr_workgroup_info = 0
+; GCN-NEXT: enable_vgpr_workitem_id = 2
+; GCN-NEXT: enable_exception_msb = 0
+; GCN-NEXT: granulated_lds_size = 0
+; GCN-NEXT: enable_exception = 0
+; GCN-NEXT: enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT: enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT: enable_sgpr_queue_ptr = 1
+; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT: enable_sgpr_dispatch_id = 1
+; GCN-NEXT: enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT: enable_sgpr_private_segment_size = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT: enable_wavefront_size32 = 0
+; GCN-NEXT: enable_ordered_append_gds = 0
+; GCN-NEXT: private_element_size = 1
+; GCN-NEXT: is_ptr64 = 1
+; GCN-NEXT: is_dynamic_callstack = 0
+; GCN-NEXT: is_debug_enabled = 0
+; GCN-NEXT: is_xnack_enabled = 0
+; GCN-NEXT: workitem_private_segment_byte_size = 0
+; GCN-NEXT: workgroup_group_segment_byte_size = 0
+; GCN-NEXT: gds_segment_byte_size = 0
+; GCN-NEXT: kernarg_segment_byte_size = 16
+; GCN-NEXT: workgroup_fbarrier_count = 0
+; GCN-NEXT: wavefront_sgpr_count = 4
+; GCN-NEXT: workitem_vgpr_count = 2
+; GCN-NEXT: reserved_vgpr_first = 0
+; GCN-NEXT: reserved_vgpr_count = 0
+; GCN-NEXT: reserved_sgpr_first = 0
+; GCN-NEXT: reserved_sgpr_count = 0
+; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT: debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT: kernarg_segment_alignment = 4
+; GCN-NEXT: group_segment_alignment = 4
+; GCN-NEXT: private_segment_alignment = 4
+; GCN-NEXT: wavefront_size = 6
+; GCN-NEXT: call_convention = -1
+; GCN-NEXT: runtime_loader_kernel_symbol = 0
+; GCN-NEXT: .end_amd_kernel_code_t
+; GCN-NEXT: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -30,14 +181,87 @@ define amdgpu_kernel void @kernel_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}kernel_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
+; GCN-LABEL: kernel_ieee_mode_off:
+; GCN: .amd_kernel_code_t
+; GCN-NEXT: amd_code_version_major = 1
+; GCN-NEXT: amd_code_version_minor = 2
+; GCN-NEXT: amd_machine_kind = 1
+; GCN-NEXT: amd_machine_version_major = 6
+; GCN-NEXT: amd_machine_version_minor = 0
+; GCN-NEXT: amd_machine_version_stepping = 0
+; GCN-NEXT: kernel_code_entry_byte_offset = 256
+; GCN-NEXT: kernel_code_prefetch_byte_size = 0
+; GCN-NEXT: granulated_workitem_vgpr_count = 0
+; GCN-NEXT: granulated_wavefront_sgpr_count = 0
+; GCN-NEXT: priority = 0
+; GCN-NEXT: float_mode = 240
+; GCN-NEXT: priv = 0
+; GCN-NEXT: enable_dx10_clamp = 1
+; GCN-NEXT: debug_mode = 0
+; GCN-NEXT: enable_ieee_mode = 0
+; GCN-NEXT: enable_wgp_mode = 0
+; GCN-NEXT: enable_mem_ordered = 0
+; GCN-NEXT: enable_fwd_progress = 0
+; GCN-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
+; GCN-NEXT: user_sgpr_count = 12
+; GCN-NEXT: enable_trap_handler = 0
+; GCN-NEXT: enable_sgpr_workgroup_id_x = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_y = 1
+; GCN-NEXT: enable_sgpr_workgroup_id_z = 1
+; GCN-NEXT: enable_sgpr_workgroup_info = 0
+; GCN-NEXT: enable_vgpr_workitem_id = 2
+; GCN-NEXT: enable_exception_msb = 0
+; GCN-NEXT: granulated_lds_size = 0
+; GCN-NEXT: enable_exception = 0
+; GCN-NEXT: enable_sgpr_private_segment_buffer = 1
+; GCN-NEXT: enable_sgpr_dispatch_ptr = 1
+; GCN-NEXT: enable_sgpr_queue_ptr = 1
+; GCN-NEXT: enable_sgpr_kernarg_segment_ptr = 1
+; GCN-NEXT: enable_sgpr_dispatch_id = 1
+; GCN-NEXT: enable_sgpr_flat_scratch_init = 0
+; GCN-NEXT: enable_sgpr_private_segment_size = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_x = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_y = 0
+; GCN-NEXT: enable_sgpr_grid_workgroup_count_z = 0
+; GCN-NEXT: enable_wavefront_size32 = 0
+; GCN-NEXT: enable_ordered_append_gds = 0
+; GCN-NEXT: private_element_size = 1
+; GCN-NEXT: is_ptr64 = 1
+; GCN-NEXT: is_dynamic_callstack = 0
+; GCN-NEXT: is_debug_enabled = 0
+; GCN-NEXT: is_xnack_enabled = 0
+; GCN-NEXT: workitem_private_segment_byte_size = 0
+; GCN-NEXT: workgroup_group_segment_byte_size = 0
+; GCN-NEXT: gds_segment_byte_size = 0
+; GCN-NEXT: kernarg_segment_byte_size = 16
+; GCN-NEXT: workgroup_fbarrier_count = 0
+; GCN-NEXT: wavefront_sgpr_count = 4
+; GCN-NEXT: workitem_vgpr_count = 2
+; GCN-NEXT: reserved_vgpr_first = 0
+; GCN-NEXT: reserved_vgpr_count = 0
+; GCN-NEXT: reserved_sgpr_first = 0
+; GCN-NEXT: reserved_sgpr_count = 0
+; GCN-NEXT: debug_wavefront_private_segment_offset_sgpr = 0
+; GCN-NEXT: debug_private_segment_buffer_sgpr = 0
+; GCN-NEXT: kernarg_segment_alignment = 4
+; GCN-NEXT: group_segment_alignment = 4
+; GCN-NEXT: private_segment_alignment = 4
+; GCN-NEXT: wavefront_size = 6
+; GCN-NEXT: call_convention = -1
+; GCN-NEXT: runtime_loader_kernel_symbol = 0
+; GCN-NEXT: .end_amd_kernel_code_t
+; GCN-NEXT: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -45,14 +269,22 @@ define amdgpu_kernel void @kernel_ieee_mode_off() #2 {
ret void
}
-; GCN-LABEL: {{^}}func_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define void @func_ieee_mode_default() #0 {
+; GCN-LABEL: func_ieee_mode_default:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -60,14 +292,22 @@ define void @func_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}func_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define void @func_ieee_mode_on() #1 {
+; GCN-LABEL: func_ieee_mode_on:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -75,14 +315,20 @@ define void @func_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}func_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define void @func_ieee_mode_off() #2 {
+; GCN-LABEL: func_ieee_mode_off:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: s_mov_b32 s6, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[4:7], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -90,14 +336,19 @@ define void @func_ieee_mode_off() #2 {
ret void
}
-; GCN-LABEL: {{^}}cs_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_cs void @cs_ieee_mode_default() #0 {
+; GCN-LABEL: cs_ieee_mode_default:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -105,14 +356,21 @@ define amdgpu_cs void @cs_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}cs_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_cs void @cs_ieee_mode_on() #1 {
+; GCN-LABEL: cs_ieee_mode_on:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -120,14 +378,19 @@ define amdgpu_cs void @cs_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}cs_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_cs void @cs_ieee_mode_off() #2 {
+; GCN-LABEL: cs_ieee_mode_off:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -135,14 +398,19 @@ define amdgpu_cs void @cs_ieee_mode_off() #2 {
ret void
}
-; GCN-LABEL: {{^}}ps_ieee_mode_default:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_ps void @ps_ieee_mode_default() #0 {
+; GCN-LABEL: ps_ieee_mode_default:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -150,14 +418,21 @@ define amdgpu_ps void @ps_ieee_mode_default() #0 {
ret void
}
-; GCN-LABEL: {{^}}ps_ieee_mode_on:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]]
-; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]]
-; GCN-NOT: v_mul_f32
define amdgpu_ps void @ps_ieee_mode_on() #1 {
+; GCN-LABEL: ps_ieee_mode_on:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
@@ -165,14 +440,19 @@ define amdgpu_ps void @ps_ieee_mode_on() #1 {
ret void
}
-; GCN-LABEL: {{^}}ps_ieee_mode_off:
-; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]]
-; GCN: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]]
-; GCN-NOT: [[VAL0]]
-; GCN-NOT: [[VAL1]]
-; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[VAL0]], [[VAL1]]
-; GCN-NOT: v_mul_f32
define amdgpu_ps void @ps_ieee_mode_off() #2 {
+; GCN-LABEL: ps_ieee_mode_off:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_min_f32_e32 v0, v0, v1
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_endpgm
%val0 = load volatile float, ptr addrspace(1) poison
%val1 = load volatile float, ptr addrspace(1) poison
%min = call float @llvm.minnum.f32(float %val0, float %val1)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index df9c97f..117af95 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -6551,271 +6551,205 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8:
@@ -15709,61 +15643,61 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
@@ -15778,121 +15712,123 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -15903,215 +15839,179 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_4
; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
@@ -16133,433 +16033,329 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -42692,271 +42488,205 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8:
@@ -53003,61 +52733,61 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
@@ -53072,121 +52802,123 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -53197,215 +52929,179 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB38_4
; GFX11-TRUE16-NEXT: .LBB38_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
@@ -53427,433 +53123,329 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2
; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -78968,271 +78560,205 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8:
@@ -88136,61 +87662,61 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
@@ -88205,121 +87731,123 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -88330,215 +87858,179 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB58_4
; GFX11-TRUE16-NEXT: .LBB58_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
@@ -88560,433 +88052,329 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -113114,271 +112502,205 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v149.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v145.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v133.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8:
@@ -123405,61 +122727,61 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:368
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:328
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:320
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:312
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:304
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:296
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:288
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:280
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:264
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:256
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:248
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:240
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:224
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v113, off, s32 offset:388
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:8
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:104
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128
@@ -123474,121 +122796,123 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:148
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:20
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:12
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54)
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v29.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v51.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(56)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v71.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v113
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v83.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.l, 8, v97.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v100.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v101.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v101.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.l, 8, v102.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v161.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v162.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v163.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v164.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.l, 8, v165.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v55.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v31.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
@@ -123599,215 +122923,179 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB74_4
; GFX11-TRUE16-NEXT: .LBB74_2: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v145.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v133.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v133.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v129.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v128.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v116.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v116.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v99.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v87.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v82.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v65.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l
; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h
; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l
; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l
; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l
; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v150.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v151.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v146.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v147.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v147.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v148.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v134.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v144.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v130.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v130.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v131.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v131.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v117.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v117.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v118.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v113.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v113.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v114.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v114.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v115.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v100.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v101.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v102.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v96.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v97.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v98.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v83.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v84.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v84.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v85.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v85.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v70.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v71.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v80.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v80.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v66.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v66.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v67.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v68.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v55.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v50.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v30.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v52.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
@@ -123829,433 +123117,329 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2
; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v145.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v133.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v129.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v128.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v116.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v103.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v99.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v99.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v87.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v87.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v86.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v83.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v82.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v82.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v70.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v69.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v68.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v65.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v65.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v64.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v53.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, v32.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h
+; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h
+; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v147.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v148.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v134.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v144.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v144.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v131.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v131.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v132.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v117.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v117.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v118.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v118.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v119.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v115.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v100.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v101.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v102.l, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v102.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v96.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v97.l, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v97.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v98.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v83.h, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v84.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v84.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v85.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v85.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v71.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v80.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v80.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v67.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v67.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v68.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.h, v28.l
+; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v55.l, v28.h
+; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.h, v29.l
+; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v51.l, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v52.l, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v52.h, v31.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v16.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v16.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v17.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v18.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v19.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v20.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v20.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v21.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v21.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v22.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v23.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v24.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v25.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v26.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v26.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v27.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v28.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v29.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v30.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v30.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v31.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v31.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -161654,179 +160838,182 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:236
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:136
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:112
-; GFX11-TRUE16-NEXT: s_clause 0x18
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:248
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:136
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:124
+; GFX11-TRUE16-NEXT: s_clause 0x1b
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v120, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v121, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v122, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v123, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v124, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v125, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v126, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v127, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v136, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v137, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v138, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v139, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v140, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v141, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v142, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v143, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v152, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v153, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v154, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_store_b32 off, v155, s32 offset:12
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_load_b32 v99, off, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_load_b32 v98, off, s32
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:4
+; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr152_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr143_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr140_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr139_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr123_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr121_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr111_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr109_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr127_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr136_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr125_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr153_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr139_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr143_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr141_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr155_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr154_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr124_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr142_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr122_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr138_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr137_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr126_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr124_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr122_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr120_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v31
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
@@ -161835,136 +161022,136 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[84:85], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[13:14]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[11:12]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[7:8]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v127, 8, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v99
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v99
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v81
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v81
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v98
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v17
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v80
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[98:99]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[102:103], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v1.l
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[80:81]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[29:30]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v181.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.h, v3.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v183.h, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v182.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v56.h, v5.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v40.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v79.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v106.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v127.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v142.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v125.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v143.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v141.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v72.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v7.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v8.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v9.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v10.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v136.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v11.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v106.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v12.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v153.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v13.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v139.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v14.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v155.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v154.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v17.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v19.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v21.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v21.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v22.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v160.h, v23.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v24.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v24.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v25.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v26.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v26.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v27.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v163.h, v28.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v28.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v29.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v29.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v30.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v30.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v176.h, v98.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v98.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v167.h, v99.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v99.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v80.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v80.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v81.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v81.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5
@@ -161980,7 +161167,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81
; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4
@@ -162019,10 +161206,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v17, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v32
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v135, v37, v49, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v149, v37, v49, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v19
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19
@@ -162036,97 +161223,101 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v34, 0x400000, v18
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v135.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v33, v35, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v33, v35, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v19
; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v36
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[132:133], 24, v[31:32]
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v17, v34, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v147.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v142, 8, v31
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v34
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v34
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v17, v33, vcc_lo
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v151, v17, v33 :: v_dual_and_b32 v18, 0xffff0000, v22
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v33, v20, v35 :: v_dual_and_b32 v18, 0xffff0000, v22
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_cndmask_b32 v33, v20, v35
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v18
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_add_f32 v22, 0x40c00000, v22
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v21, 0x40c00000, v21
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v150.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v151.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v22
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v122, 24, v34
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v124, 8, v34
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v149, v19, v35 :: v_dual_lshlrev_b32 v22, 16, v24
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v33
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v160, v19, v35, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v24
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v21
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v36, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v149.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v23
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v17, v24, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v17, v24, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v23
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v160.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v19, v35, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v148.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v36
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v161.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v26
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v33
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v151, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v120, 8, v35
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v20, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v26
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v17, v24, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v20
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v151.h
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v160, v17, v23 :: v_dual_lshlrev_b32 v21, 16, v25
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 24, v36
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v110, 8, v36
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v17, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
@@ -162139,8 +161330,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v18
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v161, v19, v23 :: v_dual_lshlrev_b32 v22, 16, v28
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v28
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v163.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
@@ -162153,10 +161346,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v27
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v161.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v150.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v37
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v17, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
@@ -162169,10 +161361,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v49
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v49
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v35
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v163, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v38.l, v162.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v165.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
@@ -162185,10 +161377,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v29
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v163.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 24, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v38
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[112:113], 24, v[37:38]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v38
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v38
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v17, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
@@ -162201,14 +161393,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v99
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v51
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v51
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v81
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v167.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v99
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v81
; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v53, v17, v24
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v21, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21
@@ -162217,14 +161409,14 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v98
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v160.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, v165.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v17, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v80
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v49.l, v164.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v17, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v22
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v98
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v80
; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_cndmask_b32 v52, v19, v24
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v22, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
@@ -162233,10 +161425,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v22, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v53
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v53
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v37
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v167, v19, v23, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v177.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v49
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v49
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v19, v23, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v20, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v2
@@ -162249,10 +161441,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v21, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v167.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v176, v17, v22, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v17, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v1
@@ -162263,13 +161454,12 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff
; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v20, 0x40c00000, v20
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v55
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v55
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v179.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v20, 16, 1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v19, v21, vcc_lo
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v180, v19, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
@@ -162282,11 +161472,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v48.l, v162.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v177.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v48
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v18, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v17, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v181, v17, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@@ -162301,9 +161490,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v139, 24, v65
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v65
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v2, v19, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v51.l, v166.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v181.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v182, v2, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v17, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, 0x400000, v3
@@ -162313,13 +161502,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v17, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v179.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 24, v51
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v50.l, v164.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v136, 24, v67
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v40, v1, v18, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v53.l, v176.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v51
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v183, v1, v18, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v6
@@ -162330,13 +161519,13 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[84:85], 24, v[50:51]
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v66.l, v183.h
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[85:86], 24, v[48:49]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[37:38]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v137, 8, v67
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v2, v17, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v53
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v53
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v50
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v40, v2, v17, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v5, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v8
@@ -162350,28 +161539,27 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v50
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v56, v2, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v41, v2, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v5, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v6
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v48
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v1, v17, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v2, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v56.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v41.h
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v60, v3, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v46, v3, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v7
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v83, v1, v8
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v71, v1, v8
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
@@ -162380,29 +161568,29 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v60.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v52.l, v166.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v1, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.l, v46.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[86:87], 24, v[52:53]
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v1, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v9
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v79, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v72, v4, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v12
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v79.h
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v76, v1, v4 :: v_dual_lshlrev_b32 v1, 16, v9
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v55.l, v178.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v74, v1, v4 :: v_dual_lshlrev_b32 v1, 16, v9
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v2, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v2, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
@@ -162410,40 +161598,40 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, v76.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v54.l, v176.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[130:131], 24, v[82:83]
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v2, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v67.l, v182.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.l, v74.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.l, v72.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v2, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v7, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[54:55]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[52:53]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[128:129], 24, v[70:71]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[82:83], 24, v[54:55]
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v6, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v106, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v2, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v14
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v106.h
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v104, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.l, v104.h
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v106, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[96:97]
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v65.l, v180.h
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v3, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v3, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, v104.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v106.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[117:118], 24, v[84:85]
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v102, v2, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
@@ -162452,8 +161640,8 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v64.l, v178.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v127, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[118:119], 24, v[33:34]
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v136, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
@@ -162461,19 +161649,19 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v129, v4, v5, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16
; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v125, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v139, v6, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v40.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v128, v3, v4, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v139.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v130, v3, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
@@ -162481,11 +161669,11 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v125.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, v127.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v102.l, v136.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v40.h
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v142, v4, v6, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v153, v4, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
@@ -162494,389 +161682,322 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v142.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v141, v2, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v153.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v154, v2, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v42.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[112:113]
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v143, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[99:100], 24, v[130:131]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[114:115], 24, v[102:103]
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v155, v7, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[144:145], 24, v[66:67]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[131:132], 24, v[68:69]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[134:135], 24, v[68:69]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[145:146], 24, v[64:65]
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v134, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v4, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v141.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[33:34]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 24, v129
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v129
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v2, v3, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v143.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v134
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v134
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v128
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v113
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[133:134]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[101:102], 24, v[128:129]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[102:103], 24, v[35:36]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v133
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v113
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 8, v112
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 24, v97
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 8, v97
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v96
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 24, v83
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v83
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v82
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 24, v69
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v69
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v126, 8, v68
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v138, 8, v66
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v154.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[100:101], 24, v[50:51]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[115:116], 24, v[35:36]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v131
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, v155.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 24, v148
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v148
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v131
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 8, v130
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[96:97], 24, v[147:148]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[97:98], 24, v[48:49]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v147
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v103
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v103
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v102
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v89, 24, v85
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v85
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v92, 8, v84
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v71
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v71
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v109, 8, v70
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 24, v69
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v121, 8, v69
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v123, 8, v68
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v125, 24, v67
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v127, 8, v67
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v140, 8, v66
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v141, 24, v65
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v143, 8, v65
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v152, 8, v64
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v54
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v52
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v55
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v55
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v54
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v52
; GFX11-TRUE16-NEXT: .LBB90_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v178.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v181.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v152.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v64.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.l, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v139.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v180.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v143.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v141.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v183.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v140.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v144.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v177.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v140.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v40.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v138.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v136.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v182.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v127.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v125.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v41.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v123.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v179.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v137.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v134.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v40.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v121.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v56.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v126.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v83.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v107.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v5.h
; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v42.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v123.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v96.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v5, v6
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v79.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v111.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v91.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v112.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v5, v7
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v60.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v109.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v75.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v5, v8
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v128.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v106.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v95.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v129.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v5, v9
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v61.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v111.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v72.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v109.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v46.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v107.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v105.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v104.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v92.l
+; GFX11-TRUE16-NEXT: s_clause 0x1
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v85.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v89.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v136.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v79.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v102.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v106.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v77.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v103.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v75.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v84.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v74.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v91.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v153.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v62.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v130.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v139.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v61.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v131.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v59.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v155.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v57.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v147.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v154.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v47.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v148.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v44.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v149.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v142.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v132.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v138.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v137.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v151.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v126.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v150.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v124.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v76.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v93.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v133.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v5, v10
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v134.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v127.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v89.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v45.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v5, v11
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v104.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v78.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v120.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v5, v12
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v142.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v73.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v34.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v5, v13
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v105.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v125.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v63.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v5, v14
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v122.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v161.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v120.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v35.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v160.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v110.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v108.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v163.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v95.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v162.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v94.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v93.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v165.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v90.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v143.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v58.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v90.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v5, v15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v141.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v47.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v74.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v5, v16
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v135.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v124.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v5, v17
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v59.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.l, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v122.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v50.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v5, v18
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v51.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v19.l, v19.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v20.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v148.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v110.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v44.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v5, v19
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v80.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v48.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v164.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v88.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v78.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v167.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v76.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v100.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v166.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v73.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v51.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v63.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v177.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v60.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v52.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v176.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v58.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v147.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v108.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v183.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v5, v20
-; GFX11-TRUE16-NEXT: v_and_b16 v33.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v150.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v94.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_and_b16 v34.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v5, v21
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v180.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v22.l, v22.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v149.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v92.l
-; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[6:9], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v5, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v160.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v88.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v5, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v24.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v25.l, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v151.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v77.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v5, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v25.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v26.l, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v162.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v72.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v5, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v26.l, v26.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v27.l, v27.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v161.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v62.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v5, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v27.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v28.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v164.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v57.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v5, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.l, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v29.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v163.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v46.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v5, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.l, v29.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v30.l, v30.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v166.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v43.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v5, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v30.l, v30.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v165.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v41.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v5, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v31.l, v31.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v32.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v176.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v182.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v5, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v32.l, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v33.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v33.l, 0xff, v167.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v181.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v5, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v33.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v33.h, v34.l, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v5.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, v5, v33
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v56.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v179.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v45.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v178.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v43.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v55.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v42.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[10:13], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[14:17], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[18:21], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[22:25], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[26:29], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[30:33], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_clause 0x1f
-; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:12
-; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:16
-; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:20
-; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:24
-; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:28
-; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:32
-; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:36
-; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:40
-; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:44
-; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:48
-; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:52
-; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:56
-; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:60
-; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:64
-; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:68
-; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:72
-; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:76
-; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:80
-; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:84
-; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:88
-; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:92
-; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:96
-; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:100
-; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:104
-; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:108
-; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:112
-; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:116
-; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:120
-; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:124
-; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:128
-; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:132
-; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:136
-; GFX11-TRUE16-NEXT: s_clause 0x18
-; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:140
-; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:144
-; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:148
-; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:152
-; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:156
-; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:160
-; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:164
-; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:168
-; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:172
-; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:176
-; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:180
-; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:184
-; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:188
-; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:192
-; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:196
-; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:200
-; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:204
-; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:208
-; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:212
-; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:216
-; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:220
-; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:224
-; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:228
-; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:232
-; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v155, off, s32 offset:12
+; GFX11-TRUE16-NEXT: scratch_load_b32 v154, off, s32 offset:16
+; GFX11-TRUE16-NEXT: scratch_load_b32 v153, off, s32 offset:20
+; GFX11-TRUE16-NEXT: scratch_load_b32 v152, off, s32 offset:24
+; GFX11-TRUE16-NEXT: scratch_load_b32 v143, off, s32 offset:28
+; GFX11-TRUE16-NEXT: scratch_load_b32 v142, off, s32 offset:32
+; GFX11-TRUE16-NEXT: scratch_load_b32 v141, off, s32 offset:36
+; GFX11-TRUE16-NEXT: scratch_load_b32 v140, off, s32 offset:40
+; GFX11-TRUE16-NEXT: scratch_load_b32 v139, off, s32 offset:44
+; GFX11-TRUE16-NEXT: scratch_load_b32 v138, off, s32 offset:48
+; GFX11-TRUE16-NEXT: scratch_load_b32 v137, off, s32 offset:52
+; GFX11-TRUE16-NEXT: scratch_load_b32 v136, off, s32 offset:56
+; GFX11-TRUE16-NEXT: scratch_load_b32 v127, off, s32 offset:60
+; GFX11-TRUE16-NEXT: scratch_load_b32 v126, off, s32 offset:64
+; GFX11-TRUE16-NEXT: scratch_load_b32 v125, off, s32 offset:68
+; GFX11-TRUE16-NEXT: scratch_load_b32 v124, off, s32 offset:72
+; GFX11-TRUE16-NEXT: scratch_load_b32 v123, off, s32 offset:76
+; GFX11-TRUE16-NEXT: scratch_load_b32 v122, off, s32 offset:80
+; GFX11-TRUE16-NEXT: scratch_load_b32 v121, off, s32 offset:84
+; GFX11-TRUE16-NEXT: scratch_load_b32 v120, off, s32 offset:88
+; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:92
+; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:96
+; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:100
+; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:104
+; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:108
+; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:112
+; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:116
+; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:120
+; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:124
+; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:128
+; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:132
+; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:136
+; GFX11-TRUE16-NEXT: s_clause 0x1b
+; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:140
+; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:144
+; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:148
+; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:152
+; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:156
+; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:160
+; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:164
+; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:168
+; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:172
+; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:176
+; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:180
+; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:184
+; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:188
+; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:192
+; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:196
+; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:200
+; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:204
+; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:208
+; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:212
+; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:216
+; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:220
+; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:224
+; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:228
+; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:232
+; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:236
+; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:240
+; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:244
+; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:248
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -186713,69 +185834,69 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
@@ -186784,95 +185905,91 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17
; GFX11-TRUE16-NEXT: .LBB94_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1]
@@ -186883,345 +186000,283 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1]
; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17
; GFX11-TRUE16-NEXT: .LBB94_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v176.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v167.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v166.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v165.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v69.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v66.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v134.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v149.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v147.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v87.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8:
@@ -209415,69 +208470,69 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4
; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
@@ -209486,95 +208541,91 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17
; GFX11-TRUE16-NEXT: .LBB98_2: ; %Flow
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0]
@@ -209585,345 +208636,283 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0]
; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[13:14]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[11:12]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[5:6]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[80:81], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v13
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v10
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v9
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v5
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[25:26]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[17:18]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v16
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v13
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v12
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v8
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v29
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v26
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v25
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v23
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v21
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v20
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v19
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v18
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v3
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v31
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v30
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v29
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v28
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v27
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v26
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v23
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v20
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v19
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v18
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 8, v17
; GFX11-TRUE16-NEXT: .LBB98_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v176.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v80.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v167.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v166.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v165.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v70.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v33.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v36.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v160.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v150.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v69.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v148.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v149.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v66.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v36.h, 8, v134.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v135.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v37.h, 8, v133.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v36.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v129.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v87.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v148.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v134.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v86.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31
-; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v112.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v35.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v9.h, v34.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v98.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v96.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v10.l, v35.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v86.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v16.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v67.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v164.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v163.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v161.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v64.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v151.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v149.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v147.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v52.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v145.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v135.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v132.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v131.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v130.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v19.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v117.l
+; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v115.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v113.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v19.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v20.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v21.h, v22.l
+; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v103.l
+; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v97.l
+; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v21.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v22.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v23.l, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v24.l, v24.h
+; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v30.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v87.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v85.l
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v31.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v32.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v83.l
+; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.h, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v25.h, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v26.h, v27.l
; GFX11-TRUE16-NEXT: s_clause 0x5
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96
-; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96
+; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
index 64b5ecc..582f31b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll
@@ -4125,19 +4125,19 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -4152,94 +4152,71 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8614,19 +8591,19 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -8641,94 +8618,71 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -12703,19 +12657,19 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -12730,94 +12684,71 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -16408,19 +16339,19 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -16435,94 +16366,71 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -19833,19 +19741,19 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -19860,94 +19768,71 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -22745,19 +22630,19 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -22772,94 +22657,71 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -24960,19 +24822,19 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16
@@ -24987,94 +24849,71 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v6.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v8.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
index cb4b3bd..0a73571 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll
@@ -6298,31 +6298,33 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3
@@ -6335,48 +6337,43 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
@@ -6387,122 +6384,88 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -13349,31 +13312,33 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
@@ -13386,48 +13351,43 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
@@ -13438,122 +13398,88 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -19888,31 +19814,33 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3
@@ -19925,48 +19853,43 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
@@ -19977,122 +19900,88 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -25929,31 +25818,33 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3
@@ -25966,48 +25857,43 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
@@ -26018,122 +25904,88 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v15.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
index 3aaf254..b622e6e 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll
@@ -3044,91 +3044,66 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8:
@@ -5025,39 +5000,41 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3
@@ -5071,63 +5048,53 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v12.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
@@ -5140,147 +5107,110 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v15.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v17.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v18.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v19.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v12.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -9991,91 +9921,66 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8:
@@ -11997,39 +11902,41 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v17.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3
@@ -12043,63 +11950,53 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB34_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v23.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v12.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
@@ -12112,147 +12009,110 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2
; GFX11-TRUE16-NEXT: .LBB34_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v15.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v16.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v17.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v18.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v19.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v12.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -16367,91 +16227,66 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8:
@@ -22484,91 +22319,66 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8:
@@ -28791,39 +28601,38 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3
@@ -28837,65 +28646,55 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB72_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
@@ -28906,146 +28705,110 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2
; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v23.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v22.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v21.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v24.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v25.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v26.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v27.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v16.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v18.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -30878,91 +30641,66 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8:
@@ -32912,39 +32650,38 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3
@@ -32958,65 +32695,55 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB76_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v23.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v33.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
@@ -33027,146 +32754,110 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2
; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v23.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v22.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v21.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v24.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v25.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v26.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v27.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v16.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v18.l, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -35022,91 +34713,66 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v18.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v13.l
; GFX11-TRUE16-NEXT: s_clause 0x2
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
-; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32
+; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
index 632b03c..e6c7b1a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll
@@ -2279,17 +2279,13 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true
@@ -2301,13 +2297,9 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4530,17 +4522,13 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2
; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true
@@ -4552,13 +4540,9 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -6487,17 +6471,13 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2
; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true
@@ -6509,13 +6489,9 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8138,17 +8114,13 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
@@ -8160,13 +8132,9 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -9502,17 +9470,13 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2
; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true
@@ -9524,13 +9488,9 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -10212,17 +10172,13 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2
; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true
@@ -10234,13 +10190,9 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
index d3fbba3..bff054f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll
@@ -8921,133 +8921,98 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -12574,53 +12539,52 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3
@@ -12633,98 +12597,82 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
@@ -12745,226 +12693,170 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -23576,133 +23468,98 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -27358,53 +27215,52 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3
@@ -27417,98 +27273,82 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
@@ -27529,226 +27369,170 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -37760,133 +37544,98 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -41418,53 +41167,52 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3
@@ -41477,98 +41225,82 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
@@ -41589,226 +41321,170 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -50954,133 +50630,98 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -54638,53 +54279,52 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v22.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.h, v6.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v80.h
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3
@@ -54697,98 +54337,82 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v50.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v50.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v29.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v54.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v51.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v52.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v30.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v39.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v48.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v48.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v24.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v25.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v26.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v27.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v22.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v16.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v18.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16
@@ -54809,226 +54433,170 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v50.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v29.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v27.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v23.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3
+; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v37.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v33.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.h, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v31.l, 3
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v51.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v51.h, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v52.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v52.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v53.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v30.h, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v39.l, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v48.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v24.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v25.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v26.h, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v27.l, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v21.l, v8.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v21.h, v9.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v22.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v23.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v16.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v18.l, v15.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v4.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v5.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v6.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v8.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v10.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v10.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v11.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v12.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v13.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v14.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v15.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -64107,133 +63675,98 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -76401,133 +75934,98 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h
; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l
; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v51.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v22.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h
; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v20.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l
; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v50.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v19.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v20.h
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v19.h
; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l
; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v28.l
; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v27.l
; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v26.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v21.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v20.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v20.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v18.l
; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v18.h
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
@@ -85053,57 +84551,57 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -85111,29 +84609,29 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_2
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[7:8]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[5:6]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[15:16]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[11:12]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[3:4]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[5:6]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[15:16]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[9:10]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[3:4]
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v15
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v14
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v15
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v13
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v12
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v11
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v10
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v11
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v8
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v7
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v6
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v7
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v6
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v3
@@ -85141,11 +84639,11 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v1
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[13:14]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[9:10]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[1:2]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[7:8]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[1:2]
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v2.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v2.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v3.h
@@ -85155,26 +84653,26 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v5.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v7.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v7.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v7.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v9.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v10.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v10.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v11.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v11.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v12.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v12.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v13.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v13.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v14.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v14.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v15.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v16.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v16.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.h, v15.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v15.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v16.l
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v16.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5
@@ -85187,71 +84685,72 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_lshlrev_b32 v4, 16, v4
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v18
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v17 :: v_dual_add_f32 v20, 0x40c00000, v18
; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v20, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v17
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_add3_u32 v17, v24, v20, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v18, v23 :: v_dual_lshlrev_b32 v1, 16, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v39, v18, v23 :: v_dual_and_b32 v2, 0xffff0000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v20, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, 0x400000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_add3_u32 v17, v24, v20, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v1, 0x7fff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v18, v21, v22, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v20
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v23, v26, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v3
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v17, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v4, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v20
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v20 :: v_dual_add_f32 v3, 0x40c00000, v3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v53.h
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v22, 16, 1
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v19, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v22
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v22, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v18
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v22
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v1, v20, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v54.h
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v18
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v17
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v55, v1, v19 :: v_dual_and_b32 v2, 0xffff0000, v6
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v1, v19, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v54.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v19, v4, v21, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
@@ -85304,305 +84803,266 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v10
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v65.h
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v66, v4, v5 :: v_dual_lshlrev_b32 v5, 16, v10
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v66, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v7
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v7 :: v_dual_lshlrev_b32 v5, 16, v10
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v21
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v24, v1, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v66.h
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v22
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v21
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[21:22]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[17:18]
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v1, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v5, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v22
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v22
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v68, v3, v8 :: v_dual_and_b32 v3, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v3, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v12
+; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v5, 16, v12
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v9
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v68, v1, v4 :: v_dual_add_f32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v6
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v1, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v9
-; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v11
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v66.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v2, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v68.h
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v67.h
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v67.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v24
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v23
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v25, v2, v6, vcc_lo
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v7, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[21:22]
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20]
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v68.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add3_u32 v1, v6, v5, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v2, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v5
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v8 :: v_dual_lshlrev_b32 v5, 16, v14
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v82.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[17:18]
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v80, v1, v2 :: v_dual_lshlrev_b32 v1, 16, v11
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v1, v2, vcc_lo
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v11
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v6, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v26
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v7, v4, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v6, 0x7fff
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v14
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v26
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v80.h
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v26
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v3, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v13
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v80.h
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 24, v24
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v71.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v27, v2, v3 :: v_dual_add_f32 v2, 0x40c00000, v4
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v28
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v4, v5, vcc_lo
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v8, v3, 0x7fff
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v28
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v4, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v13
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v8, v1, 0x7fff
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v97.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v6, v7, vcc_lo
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v98, v6, v7 :: v_dual_and_b32 v5, 0xffff0000, v16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v9
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v28
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v24
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28]
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v87.h
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v15
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v15
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v96.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v3, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v16
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v33
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v33
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v27
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v23
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v112, v4, v6 :: v_dual_add_f32 v1, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v8 :: v_dual_lshlrev_b32 v5, 16, v15
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v112.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
-; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
-; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v32
-; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v98.h
; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
-; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v2, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[27:28]
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[25:26]
; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v3, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v113, v4, v6 :: v_dual_add_f32 v6, 0x40c00000, v8
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v3, 0x7fff
+; GFX11-TRUE16-NEXT: v_add3_u32 v4, v7, v1, 0x7fff
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1
+; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v6
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v2, v9, vcc_lo
-; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v2, v9, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v113.h
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[23:24]
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v10, v6, 0x7fff
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v7, v11, vcc_lo
+; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v33
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v33
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v32
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v27
+; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v117, v7, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v4, v8, vcc_lo
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v4, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v103.h
-; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v2, v3, vcc_lo
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v113.h
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v115.h
+; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v2, v3, vcc_lo
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v117.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v38
-; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[37:38]
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 24, v39
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v39
+; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[29:30], 24, v[38:39]
; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[30:31], 24, v[32:33]
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v37
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v38
; GFX11-TRUE16-NEXT: .LBB108_4: ; %end
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v131.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v51.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v50.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v18.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v1.l, v1.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v129.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v55.h
; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v52.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v130.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v19.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v50.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v1
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v2.l, v2.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v128.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v19.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v49.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v54.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v119.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h
; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v55.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v128.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v118.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v21.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v2
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v3.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v54.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v119.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v22.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v115.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v3
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v6.l, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v118.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v65.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v116.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v21.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v48.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v64.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v114.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v22.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v112.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v7.h, v8.l
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.h, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v67.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v103.l
; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v23.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v4.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v65.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v117.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v48.l
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v24.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v4
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v101.l
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v5.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v64.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v116.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v25.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v5
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v26.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v6.l, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v7.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v68.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v114.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v98.l
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v27.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v6
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v34.l
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v7.l, v7.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v66.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v24.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v101.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v80.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v100.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h
; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v66.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v102.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v10.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v11.h
+; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v25.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v68.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v99.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v26.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v97.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v96.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v87.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v27.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v34.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.h, v11.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.h, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v13.h, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v71.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v86.l
; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v28.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v85.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v7
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v8.l, v8.h
-; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v82.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v100.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.l
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v33.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v8
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v81.l
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v9.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v67.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v99.l
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v37.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v9
-; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v38.h
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v10.l, v10.h
-; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v97.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v96.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v69.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v10
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v11.l, v11.h
-; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v12.l, v12.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v80.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v86.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v11
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v113.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v84.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v32.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v98.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v83.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v112.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v84.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v13.l, v13.h
-; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v87.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v83.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v13
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v14.l, v14.h
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v113.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v71.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v15.l, v15.h
-; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h
-; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v103.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v70.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v15
-; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v16.l, v16.h
-; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v16
+; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v14.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v15.l, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.l, v16.h
+; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v33.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v82.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v117.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v81.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v38.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v115.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v70.l
+; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v39.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v69.l
+; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.h, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v17.h, v18.l
+; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v18.h, v19.l
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
index ecc715c..11f90b9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll
@@ -3067,9 +3067,9 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -3085,52 +3085,47 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2
; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -6210,9 +6205,9 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -6228,52 +6223,47 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2
; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -9050,9 +9040,9 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -9068,52 +9058,47 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2
; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -11590,9 +11575,9 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -11608,52 +11593,47 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2
; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -13809,9 +13789,9 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -13827,52 +13807,47 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2
; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -15630,9 +15605,9 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -15648,52 +15623,47 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2
; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -16934,9 +16904,9 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1
@@ -16952,52 +16922,47 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2
; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
index 685e2fb..9a6ea1b 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll
@@ -1104,16 +1104,15 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -1128,37 +1127,28 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB6_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2
; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true
@@ -1166,36 +1156,26 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -4254,16 +4234,15 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v8.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -4278,37 +4257,28 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v3.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.l
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4
-; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2
; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true
@@ -4316,36 +4286,26 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -6909,12 +6869,12 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -6929,37 +6889,28 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB36_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2
; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true
@@ -6967,36 +6918,26 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -8669,12 +8610,12 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -8689,37 +8630,28 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB40_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2
; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true
@@ -8727,36 +8659,26 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -10079,12 +10001,12 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v10.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12
@@ -10099,37 +10021,28 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.l
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2
; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true
@@ -10137,36 +10050,26 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) {
; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h
; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v5.l, v2.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l
; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index cbf6b66..7dbbeaa 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -3632,13 +3632,9 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-TRUE16-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -3813,16 +3809,12 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
; GFX1250-TRUE16-NEXT: v_bitop3_b16 v2.l, v16.l, v16.h, 15 bitop3:0xec
; GFX1250-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v17.l
; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec
-; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v1.l
-; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.h, v2.l, v2.h, 0xff bitop3:0xec
-; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-TRUE16-NEXT: v_bitop3_b16 v0.l, v0.l, v0.h, 0xff bitop3:0xec
-; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
-; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX1250-TRUE16-NEXT: global_store_b32 v[0:1], v0, off
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_bitop3_b16 v1.l, v0.l, v0.h, 0xff bitop3:0xec
+; GFX1250-TRUE16-NEXT: global_store_b32 v[0:1], v1, off
; GFX1250-TRUE16-NEXT: s_endpgm
;
; GFX1250-FAKE16-LABEL: amdgpu_cs_v32i1:
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 26f204f..14897b6 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1771,33 +1771,29 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX11-TRUE16-NEXT: global_load_b32 v5, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, 9
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9
-; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v4.h
-; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 9
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v5.h, 9
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v5.h
+; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v5
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h
-; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
+; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x900, v0.l
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x900, v0.h
-; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v7
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v5
+; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x900, v4.l
+; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x900, v4.h
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b128 v6, v[0:3], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 78a961e..415828f 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -4858,7 +4858,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: v_cvt_f16_f32_e32 v4, v2
; SI-NEXT: s_cbranch_vccz .LBB9_2
-; SI-NEXT: ; %bb.1: ; %frem.else
+; SI-NEXT: ; %bb.1: ; %frem.else20
; SI-NEXT: v_bfi_b32 v7, s0, 0, v2
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cmp_eq_f32_e32 vcc, v5, v6
@@ -4869,7 +4869,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB9_2:
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB9_3: ; %frem.compute
+; SI-NEXT: .LBB9_3: ; %frem.compute19
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v5|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v5
@@ -4905,10 +4905,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB9_7
-; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB9_5: ; %frem.loop_body
+; SI-NEXT: .LBB9_5: ; %frem.loop_body27
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v7, v5
; SI-NEXT: v_mul_f32_e32 v5, v7, v6
@@ -4923,7 +4923,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB9_5
; SI-NEXT: ; %bb.6: ; %Flow55
; SI-NEXT: v_mov_b32_e32 v5, v7
-; SI-NEXT: .LBB9_7: ; %frem.loop_exit
+; SI-NEXT: .LBB9_7: ; %frem.loop_exit28
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v5, v5, s1
; SI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -4944,7 +4944,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cvt_f32_f16_e64 v7, |v7|
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v7
; SI-NEXT: s_cbranch_vccz .LBB9_10
-; SI-NEXT: ; %bb.9: ; %frem.else20
+; SI-NEXT: ; %bb.9: ; %frem.else
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_bfi_b32 v8, s0, 0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
@@ -4956,7 +4956,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB9_10:
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB9_11: ; %frem.compute19
+; SI-NEXT: .LBB9_11: ; %frem.compute
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v6|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v6
@@ -4992,10 +4992,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB9_15
-; SI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB9_13: ; %frem.loop_body27
+; SI-NEXT: .LBB9_13: ; %frem.loop_body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v8, v6
; SI-NEXT: v_mul_f32_e32 v6, v8, v7
@@ -5010,7 +5010,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB9_13
; SI-NEXT: ; %bb.14: ; %Flow
; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; SI-NEXT: .LBB9_15: ; %frem.loop_exit
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v6, v6, s1
; SI-NEXT: v_mul_f32_e32 v7, v6, v7
@@ -5084,7 +5084,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v3
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB9_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else20
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_bfi_b32 v7, s0, 0, v2
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5
@@ -5093,7 +5093,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB9_8
; CI-NEXT: .LBB9_2:
; CI-NEXT: ; implicit-def: $vgpr4
-; CI-NEXT: .LBB9_3: ; %frem.compute
+; CI-NEXT: .LBB9_3: ; %frem.compute19
; CI-NEXT: v_frexp_exp_i32_f32_e32 v9, v6
; CI-NEXT: v_frexp_mant_f32_e32 v4, v6
; CI-NEXT: v_frexp_mant_f32_e32 v6, v5
@@ -5118,10 +5118,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6
; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB9_7
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10
; CI-NEXT: v_add_i32_e32 v6, vcc, 11, v6
-; CI-NEXT: .LBB9_5: ; %frem.loop_body
+; CI-NEXT: .LBB9_5: ; %frem.loop_body27
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v9, v7
; CI-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -5136,7 +5136,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB9_5
; CI-NEXT: ; %bb.6: ; %Flow55
; CI-NEXT: v_mov_b32_e32 v7, v9
-; CI-NEXT: .LBB9_7: ; %frem.loop_exit
+; CI-NEXT: .LBB9_7: ; %frem.loop_exit28
; CI-NEXT: v_add_i32_e32 v6, vcc, -10, v6
; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6
; CI-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -5157,7 +5157,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v6, |v6|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6
; CI-NEXT: s_cbranch_vccz .LBB9_10
-; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; CI-NEXT: s_brev_b32 s0, -2
; CI-NEXT: v_bfi_b32 v8, s0, 0, v0
@@ -5167,7 +5167,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB9_16
; CI-NEXT: .LBB9_10:
; CI-NEXT: ; implicit-def: $vgpr5
-; CI-NEXT: .LBB9_11: ; %frem.compute19
+; CI-NEXT: .LBB9_11: ; %frem.compute
; CI-NEXT: v_frexp_exp_i32_f32_e32 v10, v7
; CI-NEXT: v_frexp_mant_f32_e32 v5, v7
; CI-NEXT: v_frexp_mant_f32_e32 v7, v6
@@ -5192,10 +5192,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v7
; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB9_15
-; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11
; CI-NEXT: v_add_i32_e32 v7, vcc, 11, v7
-; CI-NEXT: .LBB9_13: ; %frem.loop_body27
+; CI-NEXT: .LBB9_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v10, v8
; CI-NEXT: v_mul_f32_e32 v8, v10, v9
@@ -5210,7 +5210,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB9_13
; CI-NEXT: ; %bb.14: ; %Flow
; CI-NEXT: v_mov_b32_e32 v8, v10
-; CI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; CI-NEXT: .LBB9_15: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v7, vcc, -10, v7
; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7
; CI-NEXT: v_mul_f32_e32 v8, v7, v9
@@ -5275,7 +5275,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v3, |v1|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
; VI-NEXT: s_cbranch_vccz .LBB9_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else20
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v2, s2, 0, v0
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
@@ -5284,7 +5284,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB9_8
; VI-NEXT: .LBB9_2:
; VI-NEXT: ; implicit-def: $vgpr2
-; VI-NEXT: .LBB9_3: ; %frem.compute
+; VI-NEXT: .LBB9_3: ; %frem.compute19
; VI-NEXT: v_frexp_exp_i32_f32_e32 v7, v4
; VI-NEXT: v_frexp_mant_f32_e32 v2, v4
; VI-NEXT: v_frexp_mant_f32_e32 v4, v3
@@ -5309,10 +5309,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4
; VI-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB9_7
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; VI-NEXT: v_sub_u32_e32 v4, vcc, v7, v8
; VI-NEXT: v_add_u32_e32 v4, vcc, 11, v4
-; VI-NEXT: .LBB9_5: ; %frem.loop_body
+; VI-NEXT: .LBB9_5: ; %frem.loop_body27
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v7, v5
; VI-NEXT: v_mul_f32_e32 v5, v7, v6
@@ -5327,7 +5327,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB9_5
; VI-NEXT: ; %bb.6: ; %Flow55
; VI-NEXT: v_mov_b32_e32 v5, v7
-; VI-NEXT: .LBB9_7: ; %frem.loop_exit
+; VI-NEXT: .LBB9_7: ; %frem.loop_exit28
; VI-NEXT: v_add_u32_e32 v4, vcc, -10, v4
; VI-NEXT: v_ldexp_f32 v4, v5, v4
; VI-NEXT: v_mul_f32_e32 v5, v4, v6
@@ -5347,7 +5347,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v6, |v4|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v7, v6
; VI-NEXT: s_cbranch_vccz .LBB9_10
-; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v5, s2, 0, v3
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v7, v6
@@ -5356,7 +5356,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB9_16
; VI-NEXT: .LBB9_10:
; VI-NEXT: ; implicit-def: $vgpr5
-; VI-NEXT: .LBB9_11: ; %frem.compute19
+; VI-NEXT: .LBB9_11: ; %frem.compute
; VI-NEXT: v_frexp_exp_i32_f32_e32 v10, v7
; VI-NEXT: v_frexp_mant_f32_e32 v5, v7
; VI-NEXT: v_frexp_mant_f32_e32 v7, v6
@@ -5381,10 +5381,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v7
; VI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB9_15
-; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_sub_u32_e32 v7, vcc, v10, v11
; VI-NEXT: v_add_u32_e32 v7, vcc, 11, v7
-; VI-NEXT: .LBB9_13: ; %frem.loop_body27
+; VI-NEXT: .LBB9_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v10, v8
; VI-NEXT: v_mul_f32_e32 v8, v10, v9
@@ -5399,7 +5399,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB9_13
; VI-NEXT: ; %bb.14: ; %Flow
; VI-NEXT: v_mov_b32_e32 v8, v10
-; VI-NEXT: .LBB9_15: ; %frem.loop_exit28
+; VI-NEXT: .LBB9_15: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v7, vcc, -10, v7
; VI-NEXT: v_ldexp_f32 v7, v8, v7
; VI-NEXT: v_mul_f32_e32 v8, v7, v9
@@ -5443,7 +5443,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_e64 v3, |v0|
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v4, v3
; GFX9-NEXT: s_cbranch_vccz .LBB9_2
-; GFX9-NEXT: ; %bb.1: ; %frem.else
+; GFX9-NEXT: ; %bb.1: ; %frem.else20
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v2, s2, 0, v1
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v4, v3
@@ -5452,7 +5452,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB9_8
; GFX9-NEXT: .LBB9_2:
; GFX9-NEXT: ; implicit-def: $vgpr2
-; GFX9-NEXT: .LBB9_3: ; %frem.compute
+; GFX9-NEXT: .LBB9_3: ; %frem.compute19
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v7, v4
; GFX9-NEXT: v_frexp_mant_f32_e32 v2, v4
; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v3
@@ -5477,10 +5477,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v4
; GFX9-NEXT: v_div_fixup_f32 v6, v6, v3, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX9-NEXT: v_sub_u32_e32 v4, v7, v8
; GFX9-NEXT: v_add_u32_e32 v4, 11, v4
-; GFX9-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX9-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v7, v5
; GFX9-NEXT: v_mul_f32_e32 v5, v7, v6
@@ -5495,7 +5495,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB9_5
; GFX9-NEXT: ; %bb.6: ; %Flow55
; GFX9-NEXT: v_mov_b32_e32 v5, v7
-; GFX9-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX9-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX9-NEXT: v_add_u32_e32 v4, -10, v4
; GFX9-NEXT: v_ldexp_f32 v4, v5, v4
; GFX9-NEXT: v_mul_f32_e32 v5, v4, v6
@@ -5514,7 +5514,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_sdwa v5, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5
; GFX9-NEXT: s_cbranch_vccz .LBB9_10
-; GFX9-NEXT: ; %bb.9: ; %frem.else20
+; GFX9-NEXT: ; %bb.9: ; %frem.else
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v3
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5
@@ -5523,7 +5523,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB9_16
; GFX9-NEXT: .LBB9_10:
; GFX9-NEXT: ; implicit-def: $vgpr4
-; GFX9-NEXT: .LBB9_11: ; %frem.compute19
+; GFX9-NEXT: .LBB9_11: ; %frem.compute
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v9, v6
; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v6
; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v5
@@ -5548,10 +5548,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6
; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX9-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10
; GFX9-NEXT: v_add_u32_e32 v6, 11, v6
-; GFX9-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX9-NEXT: .LBB9_13: ; %frem.loop_body
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -5566,7 +5566,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB9_13
; GFX9-NEXT: ; %bb.14: ; %Flow
; GFX9-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX9-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX9-NEXT: v_add_u32_e32 v6, -10, v6
; GFX9-NEXT: v_ldexp_f32 v6, v7, v6
; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -5612,7 +5612,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v3, |v0|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3
; GFX10-NEXT: s_cbranch_vccz .LBB9_2
-; GFX10-NEXT: ; %bb.1: ; %frem.else
+; GFX10-NEXT: ; %bb.1: ; %frem.else20
; GFX10-NEXT: v_bfi_b32 v2, 0x7fff, 0, v1
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3
; GFX10-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc_lo
@@ -5620,7 +5620,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB9_8
; GFX10-NEXT: .LBB9_2:
; GFX10-NEXT: ; implicit-def: $vgpr2
-; GFX10-NEXT: .LBB9_3: ; %frem.compute
+; GFX10-NEXT: .LBB9_3: ; %frem.compute19
; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v4
; GFX10-NEXT: v_frexp_mant_f32_e32 v6, v3
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v5, v4
@@ -5647,10 +5647,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6
; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX10-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v7, v4
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -5666,7 +5666,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.6: ; %Flow55
; GFX10-NEXT: v_mov_b32_e32 v6, s2
; GFX10-NEXT: v_mov_b32_e32 v4, v7
-; GFX10-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX10-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX10-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX10-NEXT: v_ldexp_f32 v4, v4, v6
; GFX10-NEXT: v_mul_f32_e32 v5, v4, v5
@@ -5684,7 +5684,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v6, |v3|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v4
; GFX10-NEXT: s_cbranch_vccz .LBB9_10
-; GFX10-NEXT: ; %bb.9: ; %frem.else20
+; GFX10-NEXT: ; %bb.9: ; %frem.else
; GFX10-NEXT: v_bfi_b32 v5, 0x7fff, 0, v3
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v4
; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v5, vcc_lo
@@ -5692,7 +5692,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB9_16
; GFX10-NEXT: .LBB9_10:
; GFX10-NEXT: ; implicit-def: $vgpr5
-; GFX10-NEXT: .LBB9_11: ; %frem.compute19
+; GFX10-NEXT: .LBB9_11: ; %frem.compute
; GFX10-NEXT: v_frexp_mant_f32_e32 v5, v6
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v6
; GFX10-NEXT: v_ldexp_f32 v6, v5, 11
@@ -5719,10 +5719,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8
; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX10-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX10-NEXT: .LBB9_13: ; %frem.loop_body
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v9, v6
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -5738,7 +5738,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.14: ; %Flow
; GFX10-NEXT: v_mov_b32_e32 v8, s2
; GFX10-NEXT: v_mov_b32_e32 v6, v9
-; GFX10-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX10-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX10-NEXT: v_add_nc_u32_e32 v8, -10, v8
; GFX10-NEXT: v_ldexp_f32 v6, v6, v8
; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7
@@ -5782,7 +5782,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_2
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else20
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3
@@ -5793,7 +5793,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB9_8
; GFX11-TRUE16-NEXT: .LBB9_2:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2
-; GFX11-TRUE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX11-TRUE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, v4
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v3
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v4
@@ -5829,11 +5829,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX11-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v4
@@ -5853,7 +5853,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.6: ; %Flow55
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX11-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX11-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX11-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -5880,7 +5880,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB9_10
-; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5
@@ -5891,7 +5891,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB9_16
; GFX11-TRUE16-NEXT: .LBB9_10:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7
-; GFX11-TRUE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX11-TRUE16-NEXT: .LBB9_11: ; %frem.compute
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v6
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v6, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -5927,11 +5927,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX11-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v7
@@ -5951,7 +5951,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.14: ; %Flow
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v10
-; GFX11-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX11-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v9, -10, v9
; GFX11-TRUE16-NEXT: v_ldexp_f32 v7, v7, v9
@@ -6002,7 +6002,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v4, v3
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB9_2
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else20
; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, v0
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v4, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -6011,7 +6011,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB9_8
; GFX11-FAKE16-NEXT: .LBB9_2:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr2
-; GFX11-FAKE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX11-FAKE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, v4
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v6, v3
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, v4
@@ -6047,11 +6047,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX11-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v4
@@ -6071,7 +6071,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.6: ; %Flow55
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX11-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX11-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX11-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -6097,7 +6097,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v7, v5
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB9_10
-; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else
; GFX11-FAKE16-NEXT: v_bfi_b32 v6, 0x7fff, 0, v3
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v7, v5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -6106,7 +6106,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB9_16
; GFX11-FAKE16-NEXT: .LBB9_10:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr6
-; GFX11-FAKE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX11-FAKE16-NEXT: .LBB9_11: ; %frem.compute
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v6, v7
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v8, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
@@ -6142,11 +6142,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX11-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v7
@@ -6166,7 +6166,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.14: ; %Flow
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v10
-; GFX11-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX11-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, -10, v9
; GFX11-FAKE16-NEXT: v_ldexp_f32 v7, v7, v9
@@ -6220,7 +6220,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2
-; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else20
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s6, s5
@@ -6232,7 +6232,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB9_8
; GFX1150-TRUE16-NEXT: .LBB9_2:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0
-; GFX1150-TRUE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX1150-TRUE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s5
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s6
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -6267,11 +6267,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s5, s6, s5
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s5, s5, 11
-; GFX1150-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX1150-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v2
@@ -6293,7 +6293,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.6: ; %Flow55
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, s5
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX1150-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -6323,7 +6323,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10
-; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s7
@@ -6335,7 +6335,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB9_16
; GFX1150-TRUE16-NEXT: .LBB9_10:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1
-; GFX1150-TRUE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX1150-TRUE16-NEXT: .LBB9_11: ; %frem.compute
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s7
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s8
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8
@@ -6370,11 +6370,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s7, s8, s7
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s7, s7, 11
-; GFX1150-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX1150-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, v3
@@ -6396,7 +6396,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.14: ; %Flow
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, s7
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX1150-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -6459,7 +6459,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s6, s5
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB9_2
-; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else20
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s6, s5
; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -6469,7 +6469,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB9_8
; GFX1150-FAKE16-NEXT: .LBB9_2:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1150-FAKE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX1150-FAKE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s5
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s6
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -6504,11 +6504,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s5, s6, s5
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s5, s5, 11
-; GFX1150-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX1150-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v2
@@ -6530,7 +6530,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.6: ; %Flow55
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, s5
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX1150-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -6559,7 +6559,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s8, s7
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB9_10
-; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s8, s7
; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -6569,7 +6569,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB9_16
; GFX1150-FAKE16-NEXT: .LBB9_10:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr1
-; GFX1150-FAKE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX1150-FAKE16-NEXT: .LBB9_11: ; %frem.compute
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s7
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s8
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8
@@ -6604,11 +6604,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s7, s8, s7
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s7, s7, 11
-; GFX1150-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX1150-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, v3
@@ -6630,7 +6630,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.14: ; %Flow
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, s7
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX1150-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -6690,7 +6690,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s6, s5
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_2
-; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else20
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s6, s5
@@ -6702,7 +6702,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB9_8
; GFX1200-TRUE16-NEXT: .LBB9_2:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr0
-; GFX1200-TRUE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX1200-TRUE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s5
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s6
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -6737,11 +6737,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s5, s6, s5
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s5, s5, 11
-; GFX1200-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX1200-TRUE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v2
@@ -6765,7 +6765,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.6: ; %Flow55
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, s5
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX1200-TRUE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -6799,7 +6799,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s7
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB9_10
-; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s6
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s7
@@ -6811,7 +6811,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB9_16
; GFX1200-TRUE16-NEXT: .LBB9_10:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1
-; GFX1200-TRUE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX1200-TRUE16-NEXT: .LBB9_11: ; %frem.compute
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s7
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s8
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8
@@ -6847,11 +6847,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s7, s8, s7
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s7, s7, 11
-; GFX1200-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX1200-TRUE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, v3
@@ -6875,7 +6875,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.14: ; %Flow
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, s7
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX1200-TRUE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -6940,7 +6940,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s6, s5
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB9_2
-; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else20
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s6, s5
; GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s4
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -6950,7 +6950,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB9_8
; GFX1200-FAKE16-NEXT: .LBB9_2:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1200-FAKE16-NEXT: .LBB9_3: ; %frem.compute
+; GFX1200-FAKE16-NEXT: .LBB9_3: ; %frem.compute19
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s5
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s6
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -6986,11 +6986,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB9_7
-; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body27.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s5, s6, s5
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s5, s5, 11
-; GFX1200-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body
+; GFX1200-FAKE16-NEXT: .LBB9_5: ; %frem.loop_body27
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v2
@@ -7014,7 +7014,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.6: ; %Flow55
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, s5
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit
+; GFX1200-FAKE16-NEXT: .LBB9_7: ; %frem.loop_exit28
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -7047,7 +7047,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s8, s7
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB9_10
-; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s8, s7
; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s6
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -7058,7 +7058,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB9_16
; GFX1200-FAKE16-NEXT: .LBB9_10:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr1
-; GFX1200-FAKE16-NEXT: .LBB9_11: ; %frem.compute19
+; GFX1200-FAKE16-NEXT: .LBB9_11: ; %frem.compute
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s7
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s8
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s8
@@ -7094,11 +7094,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB9_15
-; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s7, s8, s7
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s7, s7, 11
-; GFX1200-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body27
+; GFX1200-FAKE16-NEXT: .LBB9_13: ; %frem.loop_body
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, v3
@@ -7122,7 +7122,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.14: ; %Flow
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, s7
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit28
+; GFX1200-FAKE16-NEXT: .LBB9_15: ; %frem.loop_exit
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -7208,7 +7208,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: v_cvt_f16_f32_e32 v8, v6
; SI-NEXT: s_cbranch_vccz .LBB10_2
-; SI-NEXT: ; %bb.1: ; %frem.else
+; SI-NEXT: ; %bb.1: ; %frem.else86
; SI-NEXT: v_bfi_b32 v11, s0, 0, v6
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v10
@@ -7219,7 +7219,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB10_2:
; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB10_3: ; %frem.compute
+; SI-NEXT: .LBB10_3: ; %frem.compute85
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v9|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v9
@@ -7255,10 +7255,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB10_7
-; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB10_5: ; %frem.loop_body
+; SI-NEXT: .LBB10_5: ; %frem.loop_body93
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v11, v9
; SI-NEXT: v_mul_f32_e32 v9, v11, v10
@@ -7273,7 +7273,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB10_5
; SI-NEXT: ; %bb.6: ; %Flow133
; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: .LBB10_7: ; %frem.loop_exit
+; SI-NEXT: .LBB10_7: ; %frem.loop_exit94
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v9, v9, s1
; SI-NEXT: v_mul_f32_e32 v10, v9, v10
@@ -7294,7 +7294,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cvt_f32_f16_e64 v11, |v11|
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v11
; SI-NEXT: s_cbranch_vccz .LBB10_10
-; SI-NEXT: ; %bb.9: ; %frem.else20
+; SI-NEXT: ; %bb.9: ; %frem.else53
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_bfi_b32 v12, s0, 0, v4
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
@@ -7306,7 +7306,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB10_10:
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB10_11: ; %frem.compute19
+; SI-NEXT: .LBB10_11: ; %frem.compute52
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v10|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v10
@@ -7342,10 +7342,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB10_15
-; SI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; SI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB10_13: ; %frem.loop_body27
+; SI-NEXT: .LBB10_13: ; %frem.loop_body60
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v12, v10
; SI-NEXT: v_mul_f32_e32 v10, v12, v11
@@ -7360,7 +7360,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB10_13
; SI-NEXT: ; %bb.14: ; %Flow129
; SI-NEXT: v_mov_b32_e32 v10, v12
-; SI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; SI-NEXT: .LBB10_15: ; %frem.loop_exit61
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v10, v10, s1
; SI-NEXT: v_mul_f32_e32 v11, v10, v11
@@ -7381,7 +7381,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cvt_f32_f16_e64 v12, |v12|
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v12
; SI-NEXT: s_cbranch_vccz .LBB10_18
-; SI-NEXT: ; %bb.17: ; %frem.else53
+; SI-NEXT: ; %bb.17: ; %frem.else20
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_bfi_b32 v13, s0, 0, v2
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
@@ -7393,7 +7393,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB10_18:
; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB10_19: ; %frem.compute52
+; SI-NEXT: .LBB10_19: ; %frem.compute19
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v11|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v11
@@ -7429,10 +7429,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB10_23
-; SI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; SI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB10_21: ; %frem.loop_body60
+; SI-NEXT: .LBB10_21: ; %frem.loop_body27
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v13, v11
; SI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -7447,7 +7447,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB10_21
; SI-NEXT: ; %bb.22: ; %Flow125
; SI-NEXT: v_mov_b32_e32 v11, v13
-; SI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; SI-NEXT: .LBB10_23: ; %frem.loop_exit28
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v11, v11, s1
; SI-NEXT: v_mul_f32_e32 v12, v11, v12
@@ -7468,7 +7468,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cvt_f32_f16_e64 v13, |v13|
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v13
; SI-NEXT: s_cbranch_vccz .LBB10_26
-; SI-NEXT: ; %bb.25: ; %frem.else86
+; SI-NEXT: ; %bb.25: ; %frem.else
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_bfi_b32 v14, s0, 0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
@@ -7480,7 +7480,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB10_26:
; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB10_27: ; %frem.compute85
+; SI-NEXT: .LBB10_27: ; %frem.compute
; SI-NEXT: s_mov_b32 s3, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 vcc, |v12|, s3
; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v12
@@ -7516,10 +7516,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0
; SI-NEXT: s_cmp_lt_i32 s1, 12
; SI-NEXT: s_cbranch_scc1 .LBB10_31
-; SI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; SI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; SI-NEXT: s_sub_i32 s1, s2, s3
; SI-NEXT: s_add_i32 s1, s1, 11
-; SI-NEXT: .LBB10_29: ; %frem.loop_body93
+; SI-NEXT: .LBB10_29: ; %frem.loop_body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v14, v12
; SI-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -7534,7 +7534,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB10_29
; SI-NEXT: ; %bb.30: ; %Flow
; SI-NEXT: v_mov_b32_e32 v12, v14
-; SI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; SI-NEXT: .LBB10_31: ; %frem.loop_exit
; SI-NEXT: s_add_i32 s1, s1, -10
; SI-NEXT: v_ldexp_f32_e64 v12, v12, s1
; SI-NEXT: v_mul_f32_e32 v13, v12, v13
@@ -7638,7 +7638,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v7
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB10_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else86
; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
; CI-NEXT: v_bfi_b32 v11, s0, 0, v6
; CI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9
@@ -7647,7 +7647,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_8
; CI-NEXT: .LBB10_2:
; CI-NEXT: ; implicit-def: $vgpr8
-; CI-NEXT: .LBB10_3: ; %frem.compute
+; CI-NEXT: .LBB10_3: ; %frem.compute85
; CI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10
; CI-NEXT: v_frexp_mant_f32_e32 v8, v10
; CI-NEXT: v_frexp_mant_f32_e32 v10, v9
@@ -7672,10 +7672,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10
; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_7
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; CI-NEXT: v_sub_i32_e32 v10, vcc, v13, v14
; CI-NEXT: v_add_i32_e32 v10, vcc, 11, v10
-; CI-NEXT: .LBB10_5: ; %frem.loop_body
+; CI-NEXT: .LBB10_5: ; %frem.loop_body93
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v13, v11
; CI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -7690,7 +7690,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB10_5
; CI-NEXT: ; %bb.6: ; %Flow133
; CI-NEXT: v_mov_b32_e32 v11, v13
-; CI-NEXT: .LBB10_7: ; %frem.loop_exit
+; CI-NEXT: .LBB10_7: ; %frem.loop_exit94
; CI-NEXT: v_add_i32_e32 v10, vcc, -10, v10
; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10
; CI-NEXT: v_mul_f32_e32 v11, v10, v12
@@ -7711,7 +7711,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v10, |v10|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10
; CI-NEXT: s_cbranch_vccz .LBB10_10
-; CI-NEXT: ; %bb.9: ; %frem.else20
+; CI-NEXT: ; %bb.9: ; %frem.else53
; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
; CI-NEXT: s_brev_b32 s0, -2
; CI-NEXT: v_bfi_b32 v12, s0, 0, v4
@@ -7721,7 +7721,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_16
; CI-NEXT: .LBB10_10:
; CI-NEXT: ; implicit-def: $vgpr9
-; CI-NEXT: .LBB10_11: ; %frem.compute19
+; CI-NEXT: .LBB10_11: ; %frem.compute52
; CI-NEXT: v_frexp_exp_i32_f32_e32 v14, v11
; CI-NEXT: v_frexp_mant_f32_e32 v9, v11
; CI-NEXT: v_frexp_mant_f32_e32 v11, v10
@@ -7746,10 +7746,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11
; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_15
-; CI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15
; CI-NEXT: v_add_i32_e32 v11, vcc, 11, v11
-; CI-NEXT: .LBB10_13: ; %frem.loop_body27
+; CI-NEXT: .LBB10_13: ; %frem.loop_body60
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v14, v12
; CI-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -7764,7 +7764,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB10_13
; CI-NEXT: ; %bb.14: ; %Flow129
; CI-NEXT: v_mov_b32_e32 v12, v14
-; CI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; CI-NEXT: .LBB10_15: ; %frem.loop_exit61
; CI-NEXT: v_add_i32_e32 v11, vcc, -10, v11
; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11
; CI-NEXT: v_mul_f32_e32 v12, v11, v13
@@ -7785,7 +7785,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v11, |v11|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v12, v11
; CI-NEXT: s_cbranch_vccz .LBB10_18
-; CI-NEXT: ; %bb.17: ; %frem.else53
+; CI-NEXT: ; %bb.17: ; %frem.else20
; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
; CI-NEXT: s_brev_b32 s0, -2
; CI-NEXT: v_bfi_b32 v13, s0, 0, v2
@@ -7795,7 +7795,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_24
; CI-NEXT: .LBB10_18:
; CI-NEXT: ; implicit-def: $vgpr10
-; CI-NEXT: .LBB10_19: ; %frem.compute52
+; CI-NEXT: .LBB10_19: ; %frem.compute19
; CI-NEXT: v_frexp_exp_i32_f32_e32 v15, v12
; CI-NEXT: v_frexp_mant_f32_e32 v10, v12
; CI-NEXT: v_frexp_mant_f32_e32 v12, v11
@@ -7820,10 +7820,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v12
; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_23
-; CI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; CI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16
; CI-NEXT: v_add_i32_e32 v12, vcc, 11, v12
-; CI-NEXT: .LBB10_21: ; %frem.loop_body60
+; CI-NEXT: .LBB10_21: ; %frem.loop_body27
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v15, v13
; CI-NEXT: v_mul_f32_e32 v13, v15, v14
@@ -7838,7 +7838,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB10_21
; CI-NEXT: ; %bb.22: ; %Flow125
; CI-NEXT: v_mov_b32_e32 v13, v15
-; CI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; CI-NEXT: .LBB10_23: ; %frem.loop_exit28
; CI-NEXT: v_add_i32_e32 v12, vcc, -10, v12
; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12
; CI-NEXT: v_mul_f32_e32 v13, v12, v14
@@ -7859,7 +7859,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cvt_f32_f16_e64 v12, |v12|
; CI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12
; CI-NEXT: s_cbranch_vccz .LBB10_26
-; CI-NEXT: ; %bb.25: ; %frem.else86
+; CI-NEXT: ; %bb.25: ; %frem.else
; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
; CI-NEXT: s_brev_b32 s0, -2
; CI-NEXT: v_bfi_b32 v14, s0, 0, v0
@@ -7869,7 +7869,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB10_32
; CI-NEXT: .LBB10_26:
; CI-NEXT: ; implicit-def: $vgpr11
-; CI-NEXT: .LBB10_27: ; %frem.compute85
+; CI-NEXT: .LBB10_27: ; %frem.compute
; CI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13
; CI-NEXT: v_frexp_mant_f32_e32 v11, v13
; CI-NEXT: v_frexp_mant_f32_e32 v13, v12
@@ -7894,10 +7894,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13
; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB10_31
-; CI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17
; CI-NEXT: v_add_i32_e32 v13, vcc, 11, v13
-; CI-NEXT: .LBB10_29: ; %frem.loop_body93
+; CI-NEXT: .LBB10_29: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v16, v14
; CI-NEXT: v_mul_f32_e32 v14, v16, v15
@@ -7912,7 +7912,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB10_29
; CI-NEXT: ; %bb.30: ; %Flow
; CI-NEXT: v_mov_b32_e32 v14, v16
-; CI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; CI-NEXT: .LBB10_31: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v13, vcc, -10, v13
; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13
; CI-NEXT: v_mul_f32_e32 v14, v13, v15
@@ -8001,7 +8001,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v5, |v2|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5
; VI-NEXT: s_cbranch_vccz .LBB10_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else86
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v4, s2, 0, v0
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5
@@ -8010,7 +8010,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_8
; VI-NEXT: .LBB10_2:
; VI-NEXT: ; implicit-def: $vgpr4
-; VI-NEXT: .LBB10_3: ; %frem.compute
+; VI-NEXT: .LBB10_3: ; %frem.compute85
; VI-NEXT: v_frexp_exp_i32_f32_e32 v9, v6
; VI-NEXT: v_frexp_mant_f32_e32 v4, v6
; VI-NEXT: v_frexp_mant_f32_e32 v6, v5
@@ -8035,10 +8035,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6
; VI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_7
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; VI-NEXT: v_sub_u32_e32 v6, vcc, v9, v10
; VI-NEXT: v_add_u32_e32 v6, vcc, 11, v6
-; VI-NEXT: .LBB10_5: ; %frem.loop_body
+; VI-NEXT: .LBB10_5: ; %frem.loop_body93
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -8053,7 +8053,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB10_5
; VI-NEXT: ; %bb.6: ; %Flow133
; VI-NEXT: v_mov_b32_e32 v7, v9
-; VI-NEXT: .LBB10_7: ; %frem.loop_exit
+; VI-NEXT: .LBB10_7: ; %frem.loop_exit94
; VI-NEXT: v_add_u32_e32 v6, vcc, -10, v6
; VI-NEXT: v_ldexp_f32 v6, v7, v6
; VI-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -8073,7 +8073,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v8, |v6|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8
; VI-NEXT: s_cbranch_vccz .LBB10_10
-; VI-NEXT: ; %bb.9: ; %frem.else20
+; VI-NEXT: ; %bb.9: ; %frem.else53
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v7, s2, 0, v5
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v9, v8
@@ -8082,7 +8082,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_16
; VI-NEXT: .LBB10_10:
; VI-NEXT: ; implicit-def: $vgpr7
-; VI-NEXT: .LBB10_11: ; %frem.compute19
+; VI-NEXT: .LBB10_11: ; %frem.compute52
; VI-NEXT: v_frexp_exp_i32_f32_e32 v12, v9
; VI-NEXT: v_frexp_mant_f32_e32 v7, v9
; VI-NEXT: v_frexp_mant_f32_e32 v9, v8
@@ -8107,10 +8107,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9
; VI-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_15
-; VI-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; VI-NEXT: v_sub_u32_e32 v9, vcc, v12, v13
; VI-NEXT: v_add_u32_e32 v9, vcc, 11, v9
-; VI-NEXT: .LBB10_13: ; %frem.loop_body27
+; VI-NEXT: .LBB10_13: ; %frem.loop_body60
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v12, v10
; VI-NEXT: v_mul_f32_e32 v10, v12, v11
@@ -8125,7 +8125,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB10_13
; VI-NEXT: ; %bb.14: ; %Flow129
; VI-NEXT: v_mov_b32_e32 v10, v12
-; VI-NEXT: .LBB10_15: ; %frem.loop_exit28
+; VI-NEXT: .LBB10_15: ; %frem.loop_exit61
; VI-NEXT: v_add_u32_e32 v9, vcc, -10, v9
; VI-NEXT: v_ldexp_f32 v9, v10, v9
; VI-NEXT: v_mul_f32_e32 v10, v9, v11
@@ -8143,7 +8143,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v9, |v3|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v10, v9
; VI-NEXT: s_cbranch_vccz .LBB10_18
-; VI-NEXT: ; %bb.17: ; %frem.else53
+; VI-NEXT: ; %bb.17: ; %frem.else20
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v8, s2, 0, v1
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v10, v9
@@ -8152,7 +8152,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_24
; VI-NEXT: .LBB10_18:
; VI-NEXT: ; implicit-def: $vgpr8
-; VI-NEXT: .LBB10_19: ; %frem.compute52
+; VI-NEXT: .LBB10_19: ; %frem.compute19
; VI-NEXT: v_frexp_exp_i32_f32_e32 v13, v10
; VI-NEXT: v_frexp_mant_f32_e32 v8, v10
; VI-NEXT: v_frexp_mant_f32_e32 v10, v9
@@ -8177,10 +8177,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v10
; VI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_23
-; VI-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; VI-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; VI-NEXT: v_sub_u32_e32 v10, vcc, v13, v14
; VI-NEXT: v_add_u32_e32 v10, vcc, 11, v10
-; VI-NEXT: .LBB10_21: ; %frem.loop_body60
+; VI-NEXT: .LBB10_21: ; %frem.loop_body27
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -8195,7 +8195,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB10_21
; VI-NEXT: ; %bb.22: ; %Flow125
; VI-NEXT: v_mov_b32_e32 v11, v13
-; VI-NEXT: .LBB10_23: ; %frem.loop_exit61
+; VI-NEXT: .LBB10_23: ; %frem.loop_exit28
; VI-NEXT: v_add_u32_e32 v10, vcc, -10, v10
; VI-NEXT: v_ldexp_f32 v10, v11, v10
; VI-NEXT: v_mul_f32_e32 v11, v10, v12
@@ -8215,7 +8215,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e64 v12, |v10|
; VI-NEXT: v_cmp_ngt_f32_e32 vcc, v13, v12
; VI-NEXT: s_cbranch_vccz .LBB10_26
-; VI-NEXT: ; %bb.25: ; %frem.else86
+; VI-NEXT: ; %bb.25: ; %frem.else
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_bfi_b32 v11, s2, 0, v9
; VI-NEXT: v_cmp_eq_f32_e32 vcc, v13, v12
@@ -8224,7 +8224,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB10_32
; VI-NEXT: .LBB10_26:
; VI-NEXT: ; implicit-def: $vgpr11
-; VI-NEXT: .LBB10_27: ; %frem.compute85
+; VI-NEXT: .LBB10_27: ; %frem.compute
; VI-NEXT: v_frexp_exp_i32_f32_e32 v16, v13
; VI-NEXT: v_frexp_mant_f32_e32 v11, v13
; VI-NEXT: v_frexp_mant_f32_e32 v13, v12
@@ -8249,10 +8249,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 12, v13
; VI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB10_31
-; VI-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; VI-NEXT: v_sub_u32_e32 v13, vcc, v16, v17
; VI-NEXT: v_add_u32_e32 v13, vcc, 11, v13
-; VI-NEXT: .LBB10_29: ; %frem.loop_body93
+; VI-NEXT: .LBB10_29: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v16, v14
; VI-NEXT: v_mul_f32_e32 v14, v16, v15
@@ -8267,7 +8267,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB10_29
; VI-NEXT: ; %bb.30: ; %Flow
; VI-NEXT: v_mov_b32_e32 v14, v16
-; VI-NEXT: .LBB10_31: ; %frem.loop_exit94
+; VI-NEXT: .LBB10_31: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v13, vcc, -10, v13
; VI-NEXT: v_ldexp_f32 v13, v14, v13
; VI-NEXT: v_mul_f32_e32 v14, v13, v15
@@ -8320,7 +8320,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_e64 v5, |v0|
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v6, v5
; GFX9-NEXT: s_cbranch_vccz .LBB10_2
-; GFX9-NEXT: ; %bb.1: ; %frem.else
+; GFX9-NEXT: ; %bb.1: ; %frem.else86
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v2
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v6, v5
@@ -8329,7 +8329,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB10_8
; GFX9-NEXT: .LBB10_2:
; GFX9-NEXT: ; implicit-def: $vgpr4
-; GFX9-NEXT: .LBB10_3: ; %frem.compute
+; GFX9-NEXT: .LBB10_3: ; %frem.compute85
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v9, v6
; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v6
; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v5
@@ -8354,10 +8354,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v6
; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10
; GFX9-NEXT: v_add_u32_e32 v6, 11, v6
-; GFX9-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX9-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -8372,7 +8372,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB10_5
; GFX9-NEXT: ; %bb.6: ; %Flow133
; GFX9-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX9-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX9-NEXT: v_add_u32_e32 v6, -10, v6
; GFX9-NEXT: v_ldexp_f32 v6, v7, v6
; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -8391,7 +8391,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_sdwa v7, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v8, v7
; GFX9-NEXT: s_cbranch_vccz .LBB10_10
-; GFX9-NEXT: ; %bb.9: ; %frem.else20
+; GFX9-NEXT: ; %bb.9: ; %frem.else53
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v6, s2, 0, v5
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v8, v7
@@ -8400,7 +8400,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB10_16
; GFX9-NEXT: .LBB10_10:
; GFX9-NEXT: ; implicit-def: $vgpr6
-; GFX9-NEXT: .LBB10_11: ; %frem.compute19
+; GFX9-NEXT: .LBB10_11: ; %frem.compute52
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v11, v8
; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v8
; GFX9-NEXT: v_frexp_mant_f32_e32 v8, v7
@@ -8425,10 +8425,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v8
; GFX9-NEXT: v_div_fixup_f32 v10, v10, v7, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX9-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX9-NEXT: v_sub_u32_e32 v8, v11, v12
; GFX9-NEXT: v_add_u32_e32 v8, 11, v8
-; GFX9-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX9-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v11, v9
; GFX9-NEXT: v_mul_f32_e32 v9, v11, v10
@@ -8443,7 +8443,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB10_13
; GFX9-NEXT: ; %bb.14: ; %Flow129
; GFX9-NEXT: v_mov_b32_e32 v9, v11
-; GFX9-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX9-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX9-NEXT: v_add_u32_e32 v8, -10, v8
; GFX9-NEXT: v_ldexp_f32 v8, v9, v8
; GFX9-NEXT: v_mul_f32_e32 v9, v8, v10
@@ -8461,7 +8461,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_e64 v8, |v1|
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v9, v8
; GFX9-NEXT: s_cbranch_vccz .LBB10_18
-; GFX9-NEXT: ; %bb.17: ; %frem.else53
+; GFX9-NEXT: ; %bb.17: ; %frem.else20
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v7, s2, 0, v3
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v9, v8
@@ -8470,7 +8470,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB10_24
; GFX9-NEXT: .LBB10_18:
; GFX9-NEXT: ; implicit-def: $vgpr7
-; GFX9-NEXT: .LBB10_19: ; %frem.compute52
+; GFX9-NEXT: .LBB10_19: ; %frem.compute19
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v12, v9
; GFX9-NEXT: v_frexp_mant_f32_e32 v7, v9
; GFX9-NEXT: v_frexp_mant_f32_e32 v9, v8
@@ -8495,10 +8495,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v9
; GFX9-NEXT: v_div_fixup_f32 v11, v11, v8, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX9-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX9-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX9-NEXT: v_sub_u32_e32 v9, v12, v13
; GFX9-NEXT: v_add_u32_e32 v9, 11, v9
-; GFX9-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX9-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v12, v10
; GFX9-NEXT: v_mul_f32_e32 v10, v12, v11
@@ -8513,7 +8513,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB10_21
; GFX9-NEXT: ; %bb.22: ; %Flow125
; GFX9-NEXT: v_mov_b32_e32 v10, v12
-; GFX9-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX9-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX9-NEXT: v_add_u32_e32 v9, -10, v9
; GFX9-NEXT: v_ldexp_f32 v9, v10, v9
; GFX9-NEXT: v_mul_f32_e32 v10, v9, v11
@@ -8532,7 +8532,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cvt_f32_f16_sdwa v10, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, v11, v10
; GFX9-NEXT: s_cbranch_vccz .LBB10_26
-; GFX9-NEXT: ; %bb.25: ; %frem.else86
+; GFX9-NEXT: ; %bb.25: ; %frem.else
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_bfi_b32 v9, s2, 0, v8
; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, v11, v10
@@ -8541,7 +8541,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB10_32
; GFX9-NEXT: .LBB10_26:
; GFX9-NEXT: ; implicit-def: $vgpr9
-; GFX9-NEXT: .LBB10_27: ; %frem.compute85
+; GFX9-NEXT: .LBB10_27: ; %frem.compute
; GFX9-NEXT: v_frexp_exp_i32_f32_e32 v14, v11
; GFX9-NEXT: v_frexp_mant_f32_e32 v9, v11
; GFX9-NEXT: v_frexp_mant_f32_e32 v11, v10
@@ -8566,10 +8566,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 12, v11
; GFX9-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX9-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX9-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX9-NEXT: v_sub_u32_e32 v11, v14, v15
; GFX9-NEXT: v_add_u32_e32 v11, 11, v11
-; GFX9-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX9-NEXT: .LBB10_29: ; %frem.loop_body
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v14, v12
; GFX9-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -8584,7 +8584,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB10_29
; GFX9-NEXT: ; %bb.30: ; %Flow
; GFX9-NEXT: v_mov_b32_e32 v12, v14
-; GFX9-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX9-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX9-NEXT: v_add_u32_e32 v11, -10, v11
; GFX9-NEXT: v_ldexp_f32 v11, v12, v11
; GFX9-NEXT: v_mul_f32_e32 v12, v11, v13
@@ -8640,7 +8640,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v5, |v0|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5
; GFX10-NEXT: s_cbranch_vccz .LBB10_2
-; GFX10-NEXT: ; %bb.1: ; %frem.else
+; GFX10-NEXT: ; %bb.1: ; %frem.else86
; GFX10-NEXT: v_bfi_b32 v4, 0x7fff, 0, v2
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5
; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v4, vcc_lo
@@ -8648,7 +8648,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB10_8
; GFX10-NEXT: .LBB10_2:
; GFX10-NEXT: ; implicit-def: $vgpr4
-; GFX10-NEXT: .LBB10_3: ; %frem.compute
+; GFX10-NEXT: .LBB10_3: ; %frem.compute85
; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v6
; GFX10-NEXT: v_frexp_mant_f32_e32 v8, v5
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v6
@@ -8675,10 +8675,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v8
; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX10-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v9, v6
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -8694,7 +8694,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.6: ; %Flow133
; GFX10-NEXT: v_mov_b32_e32 v8, s2
; GFX10-NEXT: v_mov_b32_e32 v6, v9
-; GFX10-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX10-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX10-NEXT: v_add_nc_u32_e32 v8, -10, v8
; GFX10-NEXT: v_ldexp_f32 v6, v6, v8
; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7
@@ -8712,7 +8712,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v8, |v5|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v8, v7
; GFX10-NEXT: s_cbranch_vccz .LBB10_10
-; GFX10-NEXT: ; %bb.9: ; %frem.else20
+; GFX10-NEXT: ; %bb.9: ; %frem.else53
; GFX10-NEXT: v_bfi_b32 v6, 0x7fff, 0, v5
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v8, v7
; GFX10-NEXT: v_cndmask_b32_e32 v6, v5, v6, vcc_lo
@@ -8720,7 +8720,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB10_16
; GFX10-NEXT: .LBB10_10:
; GFX10-NEXT: ; implicit-def: $vgpr6
-; GFX10-NEXT: .LBB10_11: ; %frem.compute19
+; GFX10-NEXT: .LBB10_11: ; %frem.compute52
; GFX10-NEXT: v_frexp_mant_f32_e32 v6, v8
; GFX10-NEXT: v_frexp_mant_f32_e32 v10, v7
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v9, v8
@@ -8747,10 +8747,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v10
; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX10-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX10-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v11, v8
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -8766,7 +8766,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.14: ; %Flow129
; GFX10-NEXT: v_mov_b32_e32 v10, s2
; GFX10-NEXT: v_mov_b32_e32 v8, v11
-; GFX10-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX10-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX10-NEXT: v_add_nc_u32_e32 v10, -10, v10
; GFX10-NEXT: v_ldexp_f32 v8, v8, v10
; GFX10-NEXT: v_mul_f32_e32 v9, v8, v9
@@ -8783,7 +8783,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v8, |v1|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8
; GFX10-NEXT: s_cbranch_vccz .LBB10_18
-; GFX10-NEXT: ; %bb.17: ; %frem.else53
+; GFX10-NEXT: ; %bb.17: ; %frem.else20
; GFX10-NEXT: v_bfi_b32 v7, 0x7fff, 0, v3
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8
; GFX10-NEXT: v_cndmask_b32_e32 v7, v3, v7, vcc_lo
@@ -8791,7 +8791,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB10_24
; GFX10-NEXT: .LBB10_18:
; GFX10-NEXT: ; implicit-def: $vgpr7
-; GFX10-NEXT: .LBB10_19: ; %frem.compute52
+; GFX10-NEXT: .LBB10_19: ; %frem.compute19
; GFX10-NEXT: v_frexp_mant_f32_e32 v7, v9
; GFX10-NEXT: v_frexp_mant_f32_e32 v11, v8
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v10, v9
@@ -8818,10 +8818,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v11
; GFX10-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX10-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX10-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX10-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v12, v9
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -8837,7 +8837,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.22: ; %Flow125
; GFX10-NEXT: v_mov_b32_e32 v11, s2
; GFX10-NEXT: v_mov_b32_e32 v9, v12
-; GFX10-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX10-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX10-NEXT: v_add_nc_u32_e32 v11, -10, v11
; GFX10-NEXT: v_ldexp_f32 v9, v9, v11
; GFX10-NEXT: v_mul_f32_e32 v10, v9, v10
@@ -8855,7 +8855,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cvt_f32_f16_e64 v11, |v8|
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v11, v10
; GFX10-NEXT: s_cbranch_vccz .LBB10_26
-; GFX10-NEXT: ; %bb.25: ; %frem.else86
+; GFX10-NEXT: ; %bb.25: ; %frem.else
; GFX10-NEXT: v_bfi_b32 v9, 0x7fff, 0, v8
; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, v11, v10
; GFX10-NEXT: v_cndmask_b32_e32 v9, v8, v9, vcc_lo
@@ -8863,7 +8863,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB10_32
; GFX10-NEXT: .LBB10_26:
; GFX10-NEXT: ; implicit-def: $vgpr9
-; GFX10-NEXT: .LBB10_27: ; %frem.compute85
+; GFX10-NEXT: .LBB10_27: ; %frem.compute
; GFX10-NEXT: v_frexp_mant_f32_e32 v9, v11
; GFX10-NEXT: v_frexp_mant_f32_e32 v13, v10
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v12, v11
@@ -8890,10 +8890,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v13
; GFX10-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX10-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX10-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 11
-; GFX10-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX10-NEXT: .LBB10_29: ; %frem.loop_body
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v14, v11
; GFX10-NEXT: s_add_i32 s2, s2, -11
@@ -8909,7 +8909,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.30: ; %Flow
; GFX10-NEXT: v_mov_b32_e32 v13, s2
; GFX10-NEXT: v_mov_b32_e32 v11, v14
-; GFX10-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX10-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX10-NEXT: v_add_nc_u32_e32 v13, -10, v13
; GFX10-NEXT: v_ldexp_f32 v11, v11, v13
; GFX10-NEXT: v_mul_f32_e32 v12, v11, v12
@@ -8963,7 +8963,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_2
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX11-TRUE16-NEXT: ; %bb.1: ; %frem.else86
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5
@@ -8974,7 +8974,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB10_8
; GFX11-TRUE16-NEXT: .LBB10_2:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4
-; GFX11-TRUE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX11-TRUE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, v6
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v8, v5
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v6
@@ -9010,11 +9010,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX11-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v6
@@ -9034,7 +9034,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.6: ; %Flow133
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v8, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v9
-; GFX11-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX11-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, -10, v8
; GFX11-TRUE16-NEXT: v_ldexp_f32 v6, v6, v8
@@ -9061,7 +9061,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_10
-; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX11-TRUE16-NEXT: ; %bb.9: ; %frem.else53
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8
@@ -9072,7 +9072,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB10_16
; GFX11-TRUE16-NEXT: .LBB10_10:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7
-; GFX11-TRUE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX11-TRUE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v7, v9
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v11, v8
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v10, v9
@@ -9108,11 +9108,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX11-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX11-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, v9
@@ -9132,7 +9132,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.14: ; %Flow129
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v9, v12
-; GFX11-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX11-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, -10, v11
; GFX11-TRUE16-NEXT: v_ldexp_f32 v9, v9, v11
@@ -9156,7 +9156,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_18
-; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX11-TRUE16-NEXT: ; %bb.17: ; %frem.else20
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9
@@ -9167,7 +9167,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB10_24
; GFX11-TRUE16-NEXT: .LBB10_18:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8
-; GFX11-TRUE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX11-TRUE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v8, v10
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v12, v9
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v10
@@ -9203,11 +9203,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX11-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX11-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX11-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v10
@@ -9227,7 +9227,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.22: ; %Flow125
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v12, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v10, v13
-; GFX11-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX11-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v12, -10, v12
; GFX11-TRUE16-NEXT: v_ldexp_f32 v10, v10, v12
@@ -9254,7 +9254,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12
; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB10_26
-; GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX11-TRUE16-NEXT: ; %bb.25: ; %frem.else
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0
; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12
@@ -9265,7 +9265,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_branch .LBB10_32
; GFX11-TRUE16-NEXT: .LBB10_26:
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11
-; GFX11-TRUE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX11-TRUE16-NEXT: .LBB10_27: ; %frem.compute
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v11, v13
; GFX11-TRUE16-NEXT: v_frexp_mant_f32_e32 v15, v12
; GFX11-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v14, v13
@@ -9301,11 +9301,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0
; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX11-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX11-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX11-TRUE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX11-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v16, v13
@@ -9325,7 +9325,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-TRUE16-NEXT: ; %bb.30: ; %Flow
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v15, s2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, v16
-; GFX11-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX11-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v15, -10, v15
; GFX11-TRUE16-NEXT: v_ldexp_f32 v13, v13, v15
@@ -9388,7 +9388,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v6, v5
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_2
-; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX11-FAKE16-NEXT: ; %bb.1: ; %frem.else86
; GFX11-FAKE16-NEXT: v_bfi_b32 v4, 0x7fff, 0, v0
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v6, v5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -9397,7 +9397,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB10_8
; GFX11-FAKE16-NEXT: .LBB10_2:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr4
-; GFX11-FAKE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX11-FAKE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, v6
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v8, v5
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v7, v6
@@ -9433,11 +9433,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX11-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v6
@@ -9457,7 +9457,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.6: ; %Flow133
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v8, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v9
-; GFX11-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX11-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, -10, v8
; GFX11-FAKE16-NEXT: v_ldexp_f32 v6, v6, v8
@@ -9483,7 +9483,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v9, v8
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_10
-; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX11-FAKE16-NEXT: ; %bb.9: ; %frem.else53
; GFX11-FAKE16-NEXT: v_bfi_b32 v7, 0x7fff, 0, v5
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v9, v8
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -9492,7 +9492,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB10_16
; GFX11-FAKE16-NEXT: .LBB10_10:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr7
-; GFX11-FAKE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX11-FAKE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v7, v9
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v11, v8
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v10, v9
@@ -9528,11 +9528,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX11-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX11-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, v9
@@ -9552,7 +9552,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.14: ; %Flow129
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v11, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v12
-; GFX11-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX11-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, -10, v11
; GFX11-FAKE16-NEXT: v_ldexp_f32 v9, v9, v11
@@ -9575,7 +9575,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v10, v9
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_18
-; GFX11-FAKE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX11-FAKE16-NEXT: ; %bb.17: ; %frem.else20
; GFX11-FAKE16-NEXT: v_bfi_b32 v8, 0x7fff, 0, v1
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v10, v9
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -9584,7 +9584,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB10_24
; GFX11-FAKE16-NEXT: .LBB10_18:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr8
-; GFX11-FAKE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX11-FAKE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v8, v10
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v12, v9
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v11, v10
@@ -9620,11 +9620,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX11-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX11-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX11-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v10
@@ -9644,7 +9644,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.22: ; %Flow125
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v12, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, v13
-; GFX11-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX11-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, -10, v12
; GFX11-FAKE16-NEXT: v_ldexp_f32 v10, v10, v12
@@ -9670,7 +9670,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v13, v12
; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB10_26
-; GFX11-FAKE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX11-FAKE16-NEXT: ; %bb.25: ; %frem.else
; GFX11-FAKE16-NEXT: v_bfi_b32 v11, 0x7fff, 0, v9
; GFX11-FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v13, v12
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -9679,7 +9679,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_branch .LBB10_32
; GFX11-FAKE16-NEXT: .LBB10_26:
; GFX11-FAKE16-NEXT: ; implicit-def: $vgpr11
-; GFX11-FAKE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX11-FAKE16-NEXT: .LBB10_27: ; %frem.compute
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v11, v13
; GFX11-FAKE16-NEXT: v_frexp_mant_f32_e32 v15, v12
; GFX11-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v14, v13
@@ -9715,11 +9715,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0
; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX11-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX11-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX11-FAKE16-NEXT: s_sub_i32 s2, s2, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s2, s2, 11
-; GFX11-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX11-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v16, v13
@@ -9739,7 +9739,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-FAKE16-NEXT: ; %bb.30: ; %Flow
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v15, s2
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, v16
-; GFX11-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX11-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v15, -10, v15
; GFX11-FAKE16-NEXT: v_ldexp_f32 v13, v13, v15
@@ -9804,7 +9804,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2
-; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-TRUE16-NEXT: ; %bb.1: ; %frem.else86
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s8, s6
@@ -9816,7 +9816,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB10_8
; GFX1150-TRUE16-NEXT: .LBB10_2:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr0
-; GFX1150-TRUE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX1150-TRUE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s6
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s8
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -9851,11 +9851,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s6, s8, s6
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s6, s6, 11
-; GFX1150-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX1150-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v2
@@ -9877,7 +9877,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.6: ; %Flow133
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, s6
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX1150-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -9907,7 +9907,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10
-; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1150-TRUE16-NEXT: ; %bb.9: ; %frem.else53
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9
@@ -9919,7 +9919,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB10_16
; GFX1150-TRUE16-NEXT: .LBB10_10:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr1
-; GFX1150-TRUE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX1150-TRUE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s9
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s10
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -9954,11 +9954,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s9, s10, s9
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, 11
-; GFX1150-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX1150-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, v3
@@ -9980,7 +9980,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.14: ; %Flow129
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, s9
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX1150-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -10008,7 +10008,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18
-; GFX1150-TRUE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX1150-TRUE16-NEXT: ; %bb.17: ; %frem.else20
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s10, s9
@@ -10020,7 +10020,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB10_24
; GFX1150-TRUE16-NEXT: .LBB10_18:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr2
-; GFX1150-TRUE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX1150-TRUE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s9
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s10
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10
@@ -10055,11 +10055,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX1150-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s9, s10, s9
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s9, s9, 11
-; GFX1150-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX1150-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v7, v4
@@ -10081,7 +10081,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.22: ; %Flow125
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v6, s9
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX1150-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX1150-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -10111,7 +10111,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11
; GFX1150-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26
-; GFX1150-TRUE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX1150-TRUE16-NEXT: ; %bb.25: ; %frem.else
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10
; GFX1150-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX1150-TRUE16-NEXT: s_cmp_eq_f32 s12, s11
@@ -10123,7 +10123,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: s_branch .LBB10_32
; GFX1150-TRUE16-NEXT: .LBB10_26:
; GFX1150-TRUE16-NEXT: ; implicit-def: $vgpr3
-; GFX1150-TRUE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX1150-TRUE16-NEXT: .LBB10_27: ; %frem.compute
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, s11
; GFX1150-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s12
; GFX1150-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12
@@ -10158,11 +10158,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7
; GFX1150-TRUE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1150-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX1150-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX1150-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1150-TRUE16-NEXT: s_sub_i32 s11, s12, s11
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: s_add_i32 s11, s11, 11
-; GFX1150-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX1150-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX1150-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v8, v5
@@ -10184,7 +10184,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-TRUE16-NEXT: ; %bb.30: ; %Flow
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v7, s11
; GFX1150-TRUE16-NEXT: v_mov_b32_e32 v5, v8
-; GFX1150-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX1150-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX1150-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-TRUE16-NEXT: v_add_nc_u32_e32 v7, -10, v7
; GFX1150-TRUE16-NEXT: v_ldexp_f32 v5, v5, v7
@@ -10265,7 +10265,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s8, s6
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_2
-; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-FAKE16-NEXT: ; %bb.1: ; %frem.else86
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s8, s6
; GFX1150-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -10275,7 +10275,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB10_8
; GFX1150-FAKE16-NEXT: .LBB10_2:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1150-FAKE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX1150-FAKE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s6
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s8
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -10310,11 +10310,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s6, s8, s6
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s6, s6, 11
-; GFX1150-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX1150-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v2
@@ -10336,7 +10336,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.6: ; %Flow133
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, s6
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX1150-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -10365,7 +10365,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_10
-; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1150-FAKE16-NEXT: ; %bb.9: ; %frem.else53
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s10, s9
; GFX1150-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -10375,7 +10375,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB10_16
; GFX1150-FAKE16-NEXT: .LBB10_10:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr1
-; GFX1150-FAKE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX1150-FAKE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s9
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s10
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -10410,11 +10410,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s9, s10, s9
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, 11
-; GFX1150-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX1150-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, v3
@@ -10436,7 +10436,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.14: ; %Flow129
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, s9
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX1150-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -10463,7 +10463,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_18
-; GFX1150-FAKE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX1150-FAKE16-NEXT: ; %bb.17: ; %frem.else20
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s10, s9
; GFX1150-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -10473,7 +10473,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB10_24
; GFX1150-FAKE16-NEXT: .LBB10_18:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr2
-; GFX1150-FAKE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX1150-FAKE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s9
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s10
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10
@@ -10508,11 +10508,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX1150-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s9, s10, s9
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s9, s9, 11
-; GFX1150-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX1150-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v7, v4
@@ -10534,7 +10534,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.22: ; %Flow125
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v6, s9
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX1150-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX1150-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -10563,7 +10563,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1150-FAKE16-NEXT: s_cmp_ngt_f32 s12, s11
; GFX1150-FAKE16-NEXT: s_cbranch_scc0 .LBB10_26
-; GFX1150-FAKE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX1150-FAKE16-NEXT: ; %bb.25: ; %frem.else
; GFX1150-FAKE16-NEXT: s_cmp_eq_f32 s12, s11
; GFX1150-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10
; GFX1150-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -10573,7 +10573,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: s_branch .LBB10_32
; GFX1150-FAKE16-NEXT: .LBB10_26:
; GFX1150-FAKE16-NEXT: ; implicit-def: $vgpr3
-; GFX1150-FAKE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX1150-FAKE16-NEXT: .LBB10_27: ; %frem.compute
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, s11
; GFX1150-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s12
; GFX1150-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12
@@ -10608,11 +10608,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7
; GFX1150-FAKE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1150-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX1150-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX1150-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1150-FAKE16-NEXT: s_sub_i32 s11, s12, s11
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: s_add_i32 s11, s11, 11
-; GFX1150-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX1150-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX1150-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v8, v5
@@ -10634,7 +10634,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-FAKE16-NEXT: ; %bb.30: ; %Flow
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v7, s11
; GFX1150-FAKE16-NEXT: v_mov_b32_e32 v5, v8
-; GFX1150-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX1150-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX1150-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-FAKE16-NEXT: v_add_nc_u32_e32 v7, -10, v7
; GFX1150-FAKE16-NEXT: v_ldexp_f32 v5, v5, v7
@@ -10712,7 +10712,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s8, s6
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_2
-; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-TRUE16-NEXT: ; %bb.1: ; %frem.else86
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v0.l, s5
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s8, s6
@@ -10724,7 +10724,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB10_8
; GFX1200-TRUE16-NEXT: .LBB10_2:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr0
-; GFX1200-TRUE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX1200-TRUE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s6
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v0, s8
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -10759,11 +10759,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s6, s8, s6
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s6, s6, 11
-; GFX1200-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX1200-TRUE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v2
@@ -10787,7 +10787,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.6: ; %Flow133
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, s6
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX1200-TRUE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -10821,7 +10821,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_10
-; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1200-TRUE16-NEXT: ; %bb.9: ; %frem.else53
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v1.l, s8
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9
@@ -10833,7 +10833,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB10_16
; GFX1200-TRUE16-NEXT: .LBB10_10:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr1
-; GFX1200-TRUE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX1200-TRUE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s9
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v1, s10
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -10869,11 +10869,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s9, s10, s9
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, 11
-; GFX1200-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX1200-TRUE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, v3
@@ -10897,7 +10897,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.14: ; %Flow129
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, s9
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX1200-TRUE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -10928,7 +10928,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_18
-; GFX1200-TRUE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX1200-TRUE16-NEXT: ; %bb.17: ; %frem.else20
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v2.l, s7
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s10, s9
@@ -10941,7 +10941,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB10_24
; GFX1200-TRUE16-NEXT: .LBB10_18:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr2
-; GFX1200-TRUE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX1200-TRUE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s9
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v2, s10
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10
@@ -10977,11 +10977,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX1200-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s9, s10, s9
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s9, s9, 11
-; GFX1200-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX1200-TRUE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v7, v4
@@ -11005,7 +11005,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.22: ; %Flow125
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v6, s9
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX1200-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX1200-TRUE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -11039,7 +11039,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-TRUE16-NEXT: s_cmp_ngt_f32 s12, s11
; GFX1200-TRUE16-NEXT: s_cbranch_scc0 .LBB10_26
-; GFX1200-TRUE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX1200-TRUE16-NEXT: ; %bb.25: ; %frem.else
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v3.l, s10
; GFX1200-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
; GFX1200-TRUE16-NEXT: s_cmp_eq_f32 s12, s11
@@ -11051,7 +11051,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: s_branch .LBB10_32
; GFX1200-TRUE16-NEXT: .LBB10_26:
; GFX1200-TRUE16-NEXT: ; implicit-def: $vgpr3
-; GFX1200-TRUE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX1200-TRUE16-NEXT: .LBB10_27: ; %frem.compute
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v4, s11
; GFX1200-TRUE16-NEXT: v_frexp_mant_f32_e32 v3, s12
; GFX1200-TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12
@@ -11087,11 +11087,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7
; GFX1200-TRUE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1200-TRUE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX1200-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX1200-TRUE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1200-TRUE16-NEXT: s_sub_co_i32 s11, s12, s11
; GFX1200-TRUE16-NEXT: s_wait_alu 0xfffe
; GFX1200-TRUE16-NEXT: s_add_co_i32 s11, s11, 11
-; GFX1200-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX1200-TRUE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX1200-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v8, v5
@@ -11115,7 +11115,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-TRUE16-NEXT: ; %bb.30: ; %Flow
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v7, s11
; GFX1200-TRUE16-NEXT: v_mov_b32_e32 v5, v8
-; GFX1200-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX1200-TRUE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX1200-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-TRUE16-NEXT: v_add_nc_u32_e32 v7, -10, v7
; GFX1200-TRUE16-NEXT: v_ldexp_f32 v5, v5, v7
@@ -11203,7 +11203,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s8, s6
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_2
-; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-FAKE16-NEXT: ; %bb.1: ; %frem.else86
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s8, s6
; GFX1200-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, 0, s5
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -11213,7 +11213,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB10_8
; GFX1200-FAKE16-NEXT: .LBB10_2:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr0
-; GFX1200-FAKE16-NEXT: .LBB10_3: ; %frem.compute
+; GFX1200-FAKE16-NEXT: .LBB10_3: ; %frem.compute85
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s6
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v0, s8
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -11249,11 +11249,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v4
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_7
-; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.4: ; %frem.loop_body93.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s6, s8, s6
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s6, s6, 11
-; GFX1200-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body
+; GFX1200-FAKE16-NEXT: .LBB10_5: ; %frem.loop_body93
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v2
@@ -11277,7 +11277,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.6: ; %Flow133
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, s6
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit
+; GFX1200-FAKE16-NEXT: .LBB10_7: ; %frem.loop_exit94
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v4, -10, v4
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v2, v2, v4
@@ -11310,7 +11310,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_10
-; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else20
+; GFX1200-FAKE16-NEXT: ; %bb.9: ; %frem.else53
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s10, s9
; GFX1200-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, 0, s8
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -11321,7 +11321,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB10_16
; GFX1200-FAKE16-NEXT: .LBB10_10:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr1
-; GFX1200-FAKE16-NEXT: .LBB10_11: ; %frem.compute19
+; GFX1200-FAKE16-NEXT: .LBB10_11: ; %frem.compute52
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s9
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v1, s10
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -11357,11 +11357,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v5
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_15
-; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body27.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.12: ; %frem.loop_body60.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s9, s10, s9
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, 11
-; GFX1200-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body27
+; GFX1200-FAKE16-NEXT: .LBB10_13: ; %frem.loop_body60
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, v3
@@ -11385,7 +11385,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.14: ; %Flow129
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, s9
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit28
+; GFX1200-FAKE16-NEXT: .LBB10_15: ; %frem.loop_exit61
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v5, -10, v5
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v3, v3, v5
@@ -11415,7 +11415,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s10, s9
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_18
-; GFX1200-FAKE16-NEXT: ; %bb.17: ; %frem.else53
+; GFX1200-FAKE16-NEXT: ; %bb.17: ; %frem.else20
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s10, s9
; GFX1200-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, 0, s7
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -11426,7 +11426,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB10_24
; GFX1200-FAKE16-NEXT: .LBB10_18:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr2
-; GFX1200-FAKE16-NEXT: .LBB10_19: ; %frem.compute52
+; GFX1200-FAKE16-NEXT: .LBB10_19: ; %frem.compute19
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s9
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v2, s10
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v5, s10
@@ -11462,11 +11462,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v6
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_23
-; GFX1200-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body60.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.20: ; %frem.loop_body27.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s9, s10, s9
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s9, s9, 11
-; GFX1200-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body60
+; GFX1200-FAKE16-NEXT: .LBB10_21: ; %frem.loop_body27
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v7, v4
@@ -11490,7 +11490,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.22: ; %Flow125
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v6, s9
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v4, v7
-; GFX1200-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit61
+; GFX1200-FAKE16-NEXT: .LBB10_23: ; %frem.loop_exit28
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v6, -10, v6
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v4, v4, v6
@@ -11523,7 +11523,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1200-FAKE16-NEXT: s_cmp_ngt_f32 s12, s11
; GFX1200-FAKE16-NEXT: s_cbranch_scc0 .LBB10_26
-; GFX1200-FAKE16-NEXT: ; %bb.25: ; %frem.else86
+; GFX1200-FAKE16-NEXT: ; %bb.25: ; %frem.else
; GFX1200-FAKE16-NEXT: s_cmp_eq_f32 s12, s11
; GFX1200-FAKE16-NEXT: v_bfi_b32 v3, 0x7fff, 0, s10
; GFX1200-FAKE16-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -11534,7 +11534,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: s_branch .LBB10_32
; GFX1200-FAKE16-NEXT: .LBB10_26:
; GFX1200-FAKE16-NEXT: ; implicit-def: $vgpr3
-; GFX1200-FAKE16-NEXT: .LBB10_27: ; %frem.compute85
+; GFX1200-FAKE16-NEXT: .LBB10_27: ; %frem.compute
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v4, s11
; GFX1200-FAKE16-NEXT: v_frexp_mant_f32_e32 v3, s12
; GFX1200-FAKE16-NEXT: v_frexp_exp_i32_f32_e32 v6, s12
@@ -11570,11 +11570,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 12, v7
; GFX1200-FAKE16-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1200-FAKE16-NEXT: s_cbranch_vccnz .LBB10_31
-; GFX1200-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body93.preheader
+; GFX1200-FAKE16-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1200-FAKE16-NEXT: s_sub_co_i32 s11, s12, s11
; GFX1200-FAKE16-NEXT: s_wait_alu 0xfffe
; GFX1200-FAKE16-NEXT: s_add_co_i32 s11, s11, 11
-; GFX1200-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body93
+; GFX1200-FAKE16-NEXT: .LBB10_29: ; %frem.loop_body
; GFX1200-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v8, v5
@@ -11598,7 +11598,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-FAKE16-NEXT: ; %bb.30: ; %Flow
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v7, s11
; GFX1200-FAKE16-NEXT: v_mov_b32_e32 v5, v8
-; GFX1200-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit94
+; GFX1200-FAKE16-NEXT: .LBB10_31: ; %frem.loop_exit
; GFX1200-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-FAKE16-NEXT: v_add_nc_u32_e32 v7, -10, v7
; GFX1200-FAKE16-NEXT: v_ldexp_f32 v5, v5, v7
@@ -11686,7 +11686,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB11_2
-; SI-NEXT: ; %bb.1: ; %frem.else
+; SI-NEXT: ; %bb.1: ; %frem.else16
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v4, s2, 0, v0
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2|
@@ -11697,7 +11697,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB11_2:
; SI-NEXT: ; implicit-def: $vgpr4
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB11_3: ; %frem.compute
+; SI-NEXT: .LBB11_3: ; %frem.compute15
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v0|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v4, v0
@@ -11733,10 +11733,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB11_7
-; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB11_5: ; %frem.loop_body
+; SI-NEXT: .LBB11_5: ; %frem.loop_body23
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v7, v5
; SI-NEXT: v_mul_f32_e32 v5, v7, v6
@@ -11751,7 +11751,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB11_5
; SI-NEXT: ; %bb.6: ; %Flow51
; SI-NEXT: v_mov_b32_e32 v5, v7
-; SI-NEXT: .LBB11_7: ; %frem.loop_exit
+; SI-NEXT: .LBB11_7: ; %frem.loop_exit24
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v5, v5, s3
; SI-NEXT: v_mul_f32_e32 v6, v5, v6
@@ -11767,7 +11767,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB11_10
-; SI-NEXT: ; %bb.9: ; %frem.else16
+; SI-NEXT: ; %bb.9: ; %frem.else
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v5, s2, 0, v1
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3|
@@ -11778,7 +11778,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB11_10:
; SI-NEXT: ; implicit-def: $vgpr5
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB11_11: ; %frem.compute15
+; SI-NEXT: .LBB11_11: ; %frem.compute
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v5, v1
@@ -11814,10 +11814,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB11_15
-; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB11_13: ; %frem.loop_body23
+; SI-NEXT: .LBB11_13: ; %frem.loop_body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v8, v6
; SI-NEXT: v_mul_f32_e32 v6, v8, v7
@@ -11832,7 +11832,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB11_13
; SI-NEXT: ; %bb.14: ; %Flow
; SI-NEXT: v_mov_b32_e32 v6, v8
-; SI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; SI-NEXT: .LBB11_15: ; %frem.loop_exit
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v6, v6, s3
; SI-NEXT: v_mul_f32_e32 v7, v6, v7
@@ -11877,7 +11877,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB11_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else16
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v4, s2, 0, v0
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2|
@@ -11886,7 +11886,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB11_8
; CI-NEXT: .LBB11_2:
; CI-NEXT: ; implicit-def: $vgpr4
-; CI-NEXT: .LBB11_3: ; %frem.compute
+; CI-NEXT: .LBB11_3: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f32_e64 v5, |v2|
; CI-NEXT: v_ldexp_f32_e64 v5, v5, 1
; CI-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0
@@ -11911,10 +11911,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6
; CI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB11_7
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; CI-NEXT: v_sub_i32_e32 v6, vcc, v9, v10
; CI-NEXT: v_add_i32_e32 v6, vcc, 12, v6
-; CI-NEXT: .LBB11_5: ; %frem.loop_body
+; CI-NEXT: .LBB11_5: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v9, v7
; CI-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -11929,7 +11929,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB11_5
; CI-NEXT: ; %bb.6: ; %Flow51
; CI-NEXT: v_mov_b32_e32 v7, v9
-; CI-NEXT: .LBB11_7: ; %frem.loop_exit
+; CI-NEXT: .LBB11_7: ; %frem.loop_exit24
; CI-NEXT: v_add_i32_e32 v6, vcc, -11, v6
; CI-NEXT: v_ldexp_f32_e32 v6, v7, v6
; CI-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -11945,7 +11945,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB11_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v5, s2, 0, v1
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3|
@@ -11954,7 +11954,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB11_16
; CI-NEXT: .LBB11_10:
; CI-NEXT: ; implicit-def: $vgpr5
-; CI-NEXT: .LBB11_11: ; %frem.compute15
+; CI-NEXT: .LBB11_11: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e64 v6, |v3|
; CI-NEXT: v_ldexp_f32_e64 v6, v6, 1
; CI-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0
@@ -11979,10 +11979,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7
; CI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB11_15
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_sub_i32_e32 v7, vcc, v10, v11
; CI-NEXT: v_add_i32_e32 v7, vcc, 12, v7
-; CI-NEXT: .LBB11_13: ; %frem.loop_body23
+; CI-NEXT: .LBB11_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v10, v8
; CI-NEXT: v_mul_f32_e32 v8, v10, v9
@@ -11997,7 +11997,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB11_13
; CI-NEXT: ; %bb.14: ; %Flow
; CI-NEXT: v_mov_b32_e32 v8, v10
-; CI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB11_15: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v7, vcc, -11, v7
; CI-NEXT: v_ldexp_f32_e32 v7, v8, v7
; CI-NEXT: v_mul_f32_e32 v8, v7, v9
@@ -12042,7 +12042,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB11_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else16
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v4, s2, 0, v0
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2|
@@ -12051,7 +12051,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB11_8
; VI-NEXT: .LBB11_2:
; VI-NEXT: ; implicit-def: $vgpr4
-; VI-NEXT: .LBB11_3: ; %frem.compute
+; VI-NEXT: .LBB11_3: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f32_e64 v5, |v2|
; VI-NEXT: v_ldexp_f32 v5, v5, 1
; VI-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0
@@ -12076,10 +12076,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6
; VI-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB11_7
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; VI-NEXT: v_sub_u32_e32 v6, vcc, v9, v10
; VI-NEXT: v_add_u32_e32 v6, vcc, 12, v6
-; VI-NEXT: .LBB11_5: ; %frem.loop_body
+; VI-NEXT: .LBB11_5: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v9, v7
; VI-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -12094,7 +12094,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB11_5
; VI-NEXT: ; %bb.6: ; %Flow51
; VI-NEXT: v_mov_b32_e32 v7, v9
-; VI-NEXT: .LBB11_7: ; %frem.loop_exit
+; VI-NEXT: .LBB11_7: ; %frem.loop_exit24
; VI-NEXT: v_add_u32_e32 v6, vcc, -11, v6
; VI-NEXT: v_ldexp_f32 v6, v7, v6
; VI-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -12110,7 +12110,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB11_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v5, s2, 0, v1
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3|
@@ -12119,7 +12119,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB11_16
; VI-NEXT: .LBB11_10:
; VI-NEXT: ; implicit-def: $vgpr5
-; VI-NEXT: .LBB11_11: ; %frem.compute15
+; VI-NEXT: .LBB11_11: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e64 v6, |v3|
; VI-NEXT: v_ldexp_f32 v6, v6, 1
; VI-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0
@@ -12144,10 +12144,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7
; VI-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB11_15
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_sub_u32_e32 v7, vcc, v10, v11
; VI-NEXT: v_add_u32_e32 v7, vcc, 12, v7
-; VI-NEXT: .LBB11_13: ; %frem.loop_body23
+; VI-NEXT: .LBB11_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v10, v8
; VI-NEXT: v_mul_f32_e32 v8, v10, v9
@@ -12162,7 +12162,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB11_13
; VI-NEXT: ; %bb.14: ; %Flow
; VI-NEXT: v_mov_b32_e32 v8, v10
-; VI-NEXT: .LBB11_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB11_15: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v7, vcc, -11, v7
; VI-NEXT: v_ldexp_f32 v7, v8, v7
; VI-NEXT: v_mul_f32_e32 v8, v7, v9
@@ -12202,7 +12202,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v2|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB11_2
-; GFX9-NEXT: ; %bb.1: ; %frem.else
+; GFX9-NEXT: ; %bb.1: ; %frem.else16
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v4, s2, 0, v0
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v2|
@@ -12211,7 +12211,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB11_8
; GFX9-NEXT: .LBB11_2:
; GFX9-NEXT: ; implicit-def: $vgpr4
-; GFX9-NEXT: .LBB11_3: ; %frem.compute
+; GFX9-NEXT: .LBB11_3: ; %frem.compute15
; GFX9-NEXT: v_frexp_mant_f32_e64 v5, |v2|
; GFX9-NEXT: v_ldexp_f32 v5, v5, 1
; GFX9-NEXT: v_div_scale_f32 v11, s[2:3], v5, v5, 1.0
@@ -12236,10 +12236,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v6
; GFX9-NEXT: v_div_fixup_f32 v8, v8, v5, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB11_7
-; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX9-NEXT: v_sub_u32_e32 v6, v9, v10
; GFX9-NEXT: v_add_u32_e32 v6, 12, v6
-; GFX9-NEXT: .LBB11_5: ; %frem.loop_body
+; GFX9-NEXT: .LBB11_5: ; %frem.loop_body23
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v9, v7
; GFX9-NEXT: v_mul_f32_e32 v7, v9, v8
@@ -12254,7 +12254,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB11_5
; GFX9-NEXT: ; %bb.6: ; %Flow51
; GFX9-NEXT: v_mov_b32_e32 v7, v9
-; GFX9-NEXT: .LBB11_7: ; %frem.loop_exit
+; GFX9-NEXT: .LBB11_7: ; %frem.loop_exit24
; GFX9-NEXT: v_add_u32_e32 v6, -11, v6
; GFX9-NEXT: v_ldexp_f32 v6, v7, v6
; GFX9-NEXT: v_mul_f32_e32 v7, v6, v8
@@ -12270,7 +12270,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v3|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB11_10
-; GFX9-NEXT: ; %bb.9: ; %frem.else16
+; GFX9-NEXT: ; %bb.9: ; %frem.else
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v5, s2, 0, v1
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v3|
@@ -12279,7 +12279,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB11_16
; GFX9-NEXT: .LBB11_10:
; GFX9-NEXT: ; implicit-def: $vgpr5
-; GFX9-NEXT: .LBB11_11: ; %frem.compute15
+; GFX9-NEXT: .LBB11_11: ; %frem.compute
; GFX9-NEXT: v_frexp_mant_f32_e64 v6, |v3|
; GFX9-NEXT: v_ldexp_f32 v6, v6, 1
; GFX9-NEXT: v_div_scale_f32 v12, s[2:3], v6, v6, 1.0
@@ -12304,10 +12304,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v7
; GFX9-NEXT: v_div_fixup_f32 v9, v9, v6, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB11_15
-; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX9-NEXT: v_sub_u32_e32 v7, v10, v11
; GFX9-NEXT: v_add_u32_e32 v7, 12, v7
-; GFX9-NEXT: .LBB11_13: ; %frem.loop_body23
+; GFX9-NEXT: .LBB11_13: ; %frem.loop_body
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v10, v8
; GFX9-NEXT: v_mul_f32_e32 v8, v10, v9
@@ -12322,7 +12322,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB11_13
; GFX9-NEXT: ; %bb.14: ; %Flow
; GFX9-NEXT: v_mov_b32_e32 v8, v10
-; GFX9-NEXT: .LBB11_15: ; %frem.loop_exit24
+; GFX9-NEXT: .LBB11_15: ; %frem.loop_exit
; GFX9-NEXT: v_add_u32_e32 v7, -11, v7
; GFX9-NEXT: v_ldexp_f32 v7, v8, v7
; GFX9-NEXT: v_mul_f32_e32 v8, v7, v9
@@ -12363,7 +12363,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v2|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB11_2
-; GFX10-NEXT: ; %bb.1: ; %frem.else
+; GFX10-NEXT: ; %bb.1: ; %frem.else16
; GFX10-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v0
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2|
; GFX10-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc_lo
@@ -12371,7 +12371,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB11_8
; GFX10-NEXT: .LBB11_2:
; GFX10-NEXT: ; implicit-def: $vgpr4
-; GFX10-NEXT: .LBB11_3: ; %frem.compute
+; GFX10-NEXT: .LBB11_3: ; %frem.compute15
; GFX10-NEXT: v_frexp_mant_f32_e64 v5, |v2|
; GFX10-NEXT: v_frexp_mant_f32_e64 v4, |v0|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v7, v0
@@ -12398,10 +12398,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v8
; GFX10-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB11_7
-; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB11_5: ; %frem.loop_body
+; GFX10-NEXT: .LBB11_5: ; %frem.loop_body23
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v9, v6
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -12417,7 +12417,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.6: ; %Flow51
; GFX10-NEXT: v_mov_b32_e32 v8, s2
; GFX10-NEXT: v_mov_b32_e32 v6, v9
-; GFX10-NEXT: .LBB11_7: ; %frem.loop_exit
+; GFX10-NEXT: .LBB11_7: ; %frem.loop_exit24
; GFX10-NEXT: v_add_nc_u32_e32 v8, -11, v8
; GFX10-NEXT: v_ldexp_f32 v6, v6, v8
; GFX10-NEXT: v_mul_f32_e32 v7, v6, v7
@@ -12432,7 +12432,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v3|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB11_10
-; GFX10-NEXT: ; %bb.9: ; %frem.else16
+; GFX10-NEXT: ; %bb.9: ; %frem.else
; GFX10-NEXT: v_bfi_b32 v5, 0x7fffffff, 0, v1
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3|
; GFX10-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc_lo
@@ -12440,7 +12440,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB11_16
; GFX10-NEXT: .LBB11_10:
; GFX10-NEXT: ; implicit-def: $vgpr5
-; GFX10-NEXT: .LBB11_11: ; %frem.compute15
+; GFX10-NEXT: .LBB11_11: ; %frem.compute
; GFX10-NEXT: v_frexp_mant_f32_e64 v6, |v3|
; GFX10-NEXT: v_frexp_mant_f32_e64 v5, |v1|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v8, v1
@@ -12467,10 +12467,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v9
; GFX10-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB11_15
-; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB11_13: ; %frem.loop_body23
+; GFX10-NEXT: .LBB11_13: ; %frem.loop_body
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v10, v7
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -12486,7 +12486,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.14: ; %Flow
; GFX10-NEXT: v_mov_b32_e32 v9, s2
; GFX10-NEXT: v_mov_b32_e32 v7, v10
-; GFX10-NEXT: .LBB11_15: ; %frem.loop_exit24
+; GFX10-NEXT: .LBB11_15: ; %frem.loop_exit
; GFX10-NEXT: v_add_nc_u32_e32 v9, -11, v9
; GFX10-NEXT: v_ldexp_f32 v7, v7, v9
; GFX10-NEXT: v_mul_f32_e32 v8, v7, v8
@@ -12524,7 +12524,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v2|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB11_2
-; GFX11-NEXT: ; %bb.1: ; %frem.else
+; GFX11-NEXT: ; %bb.1: ; %frem.else16
; GFX11-NEXT: v_bfi_b32 v4, 0x7fffffff, 0, v0
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -12533,7 +12533,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB11_8
; GFX11-NEXT: .LBB11_2:
; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: .LBB11_3: ; %frem.compute
+; GFX11-NEXT: .LBB11_3: ; %frem.compute15
; GFX11-NEXT: v_frexp_mant_f32_e64 v5, |v2|
; GFX11-NEXT: v_frexp_mant_f32_e64 v4, |v0|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v7, v0
@@ -12569,11 +12569,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v7, v7, v5, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB11_7
-; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB11_5: ; %frem.loop_body
+; GFX11-NEXT: .LBB11_5: ; %frem.loop_body23
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v9, v6
@@ -12593,7 +12593,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.6: ; %Flow51
; GFX11-NEXT: v_mov_b32_e32 v8, s2
; GFX11-NEXT: v_mov_b32_e32 v6, v9
-; GFX11-NEXT: .LBB11_7: ; %frem.loop_exit
+; GFX11-NEXT: .LBB11_7: ; %frem.loop_exit24
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v8, -11, v8
; GFX11-NEXT: v_ldexp_f32 v6, v6, v8
@@ -12613,7 +12613,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v3|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB11_10
-; GFX11-NEXT: ; %bb.9: ; %frem.else16
+; GFX11-NEXT: ; %bb.9: ; %frem.else
; GFX11-NEXT: v_bfi_b32 v5, 0x7fffffff, 0, v1
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -12622,7 +12622,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB11_16
; GFX11-NEXT: .LBB11_10:
; GFX11-NEXT: ; implicit-def: $vgpr5
-; GFX11-NEXT: .LBB11_11: ; %frem.compute15
+; GFX11-NEXT: .LBB11_11: ; %frem.compute
; GFX11-NEXT: v_frexp_mant_f32_e64 v6, |v3|
; GFX11-NEXT: v_frexp_mant_f32_e64 v5, |v1|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v8, v1
@@ -12658,11 +12658,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v8, v8, v6, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB11_15
-; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB11_13: ; %frem.loop_body23
+; GFX11-NEXT: .LBB11_13: ; %frem.loop_body
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v10, v7
@@ -12682,7 +12682,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.14: ; %Flow
; GFX11-NEXT: v_mov_b32_e32 v9, s2
; GFX11-NEXT: v_mov_b32_e32 v7, v10
-; GFX11-NEXT: .LBB11_15: ; %frem.loop_exit24
+; GFX11-NEXT: .LBB11_15: ; %frem.loop_exit
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v9, -11, v9
; GFX11-NEXT: v_ldexp_f32 v7, v7, v9
@@ -12730,7 +12730,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s3, s8
; GFX1150-NEXT: s_cbranch_scc0 .LBB11_2
-; GFX1150-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-NEXT: ; %bb.1: ; %frem.else16
; GFX1150-NEXT: s_cmp_eq_f32 s3, s8
; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s6
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -12740,7 +12740,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB11_8
; GFX1150-NEXT: .LBB11_2:
; GFX1150-NEXT: ; implicit-def: $vgpr0
-; GFX1150-NEXT: .LBB11_3: ; %frem.compute
+; GFX1150-NEXT: .LBB11_3: ; %frem.compute15
; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s4|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v0, |s6|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -12775,11 +12775,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4
; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB11_7
-; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX1150-NEXT: s_sub_i32 s7, s7, s8
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s7, s7, 12
-; GFX1150-NEXT: .LBB11_5: ; %frem.loop_body
+; GFX1150-NEXT: .LBB11_5: ; %frem.loop_body23
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v5, v2
@@ -12801,7 +12801,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.6: ; %Flow51
; GFX1150-NEXT: v_mov_b32_e32 v4, s7
; GFX1150-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-NEXT: .LBB11_7: ; %frem.loop_exit
+; GFX1150-NEXT: .LBB11_7: ; %frem.loop_exit24
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v4, -11, v4
; GFX1150-NEXT: v_ldexp_f32 v2, v2, v4
@@ -12824,7 +12824,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s6, s8
; GFX1150-NEXT: s_cbranch_scc0 .LBB11_10
-; GFX1150-NEXT: ; %bb.9: ; %frem.else16
+; GFX1150-NEXT: ; %bb.9: ; %frem.else
; GFX1150-NEXT: s_cmp_eq_f32 s6, s8
; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s5
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -12834,7 +12834,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB11_16
; GFX1150-NEXT: .LBB11_10:
; GFX1150-NEXT: ; implicit-def: $vgpr1
-; GFX1150-NEXT: .LBB11_11: ; %frem.compute15
+; GFX1150-NEXT: .LBB11_11: ; %frem.compute
; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s2|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s5|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v4, s5
@@ -12869,11 +12869,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5
; GFX1150-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB11_15
-; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1150-NEXT: s_sub_i32 s7, s7, s8
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s7, s7, 12
-; GFX1150-NEXT: .LBB11_13: ; %frem.loop_body23
+; GFX1150-NEXT: .LBB11_13: ; %frem.loop_body
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v6, v3
@@ -12895,7 +12895,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.14: ; %Flow
; GFX1150-NEXT: v_mov_b32_e32 v5, s7
; GFX1150-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-NEXT: .LBB11_15: ; %frem.loop_exit24
+; GFX1150-NEXT: .LBB11_15: ; %frem.loop_exit
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v5, -11, v5
; GFX1150-NEXT: v_ldexp_f32 v3, v3, v5
@@ -12950,7 +12950,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT: s_cmp_ngt_f32 s3, s8
; GFX1200-NEXT: s_cbranch_scc0 .LBB11_2
-; GFX1200-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-NEXT: ; %bb.1: ; %frem.else16
; GFX1200-NEXT: s_cmp_eq_f32 s3, s8
; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s6
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -12960,7 +12960,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB11_8
; GFX1200-NEXT: .LBB11_2:
; GFX1200-NEXT: ; implicit-def: $vgpr0
-; GFX1200-NEXT: .LBB11_3: ; %frem.compute
+; GFX1200-NEXT: .LBB11_3: ; %frem.compute15
; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s4|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s6|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s6
@@ -12996,11 +12996,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4
; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB11_7
-; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s7, s7, 12
-; GFX1200-NEXT: .LBB11_5: ; %frem.loop_body
+; GFX1200-NEXT: .LBB11_5: ; %frem.loop_body23
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_mov_b32_e32 v5, v2
@@ -13024,7 +13024,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.6: ; %Flow51
; GFX1200-NEXT: v_mov_b32_e32 v4, s7
; GFX1200-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-NEXT: .LBB11_7: ; %frem.loop_exit
+; GFX1200-NEXT: .LBB11_7: ; %frem.loop_exit24
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4
; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4
@@ -13048,7 +13048,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_cmp_ngt_f32 s6, s8
; GFX1200-NEXT: s_cbranch_scc0 .LBB11_10
-; GFX1200-NEXT: ; %bb.9: ; %frem.else16
+; GFX1200-NEXT: ; %bb.9: ; %frem.else
; GFX1200-NEXT: s_cmp_eq_f32 s6, s8
; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s5
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -13059,7 +13059,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB11_16
; GFX1200-NEXT: .LBB11_10:
; GFX1200-NEXT: ; implicit-def: $vgpr1
-; GFX1200-NEXT: .LBB11_11: ; %frem.compute15
+; GFX1200-NEXT: .LBB11_11: ; %frem.compute
; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s2|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s5|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s5
@@ -13095,11 +13095,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5
; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB11_15
-; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1200-NEXT: s_sub_co_i32 s7, s7, s8
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s7, s7, 12
-; GFX1200-NEXT: .LBB11_13: ; %frem.loop_body23
+; GFX1200-NEXT: .LBB11_13: ; %frem.loop_body
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_mov_b32_e32 v6, v3
@@ -13123,7 +13123,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.14: ; %Flow
; GFX1200-NEXT: v_mov_b32_e32 v5, s7
; GFX1200-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-NEXT: .LBB11_15: ; %frem.loop_exit24
+; GFX1200-NEXT: .LBB11_15: ; %frem.loop_exit
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5
; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5
@@ -13187,7 +13187,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB12_2
-; SI-NEXT: ; %bb.1: ; %frem.else
+; SI-NEXT: ; %bb.1: ; %frem.else78
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v8, s2, 0, v0
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4|
@@ -13198,7 +13198,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB12_2:
; SI-NEXT: ; implicit-def: $vgpr8
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB12_3: ; %frem.compute
+; SI-NEXT: .LBB12_3: ; %frem.compute77
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v0|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v8, v0
@@ -13234,10 +13234,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v10, v10, v8, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB12_7
-; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB12_5: ; %frem.loop_body
+; SI-NEXT: .LBB12_5: ; %frem.loop_body85
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v11, v9
; SI-NEXT: v_mul_f32_e32 v9, v11, v10
@@ -13252,7 +13252,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB12_5
; SI-NEXT: ; %bb.6: ; %Flow125
; SI-NEXT: v_mov_b32_e32 v9, v11
-; SI-NEXT: .LBB12_7: ; %frem.loop_exit
+; SI-NEXT: .LBB12_7: ; %frem.loop_exit86
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v9, v9, s3
; SI-NEXT: v_mul_f32_e32 v10, v9, v10
@@ -13268,7 +13268,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB12_10
-; SI-NEXT: ; %bb.9: ; %frem.else16
+; SI-NEXT: ; %bb.9: ; %frem.else47
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v9, s2, 0, v1
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5|
@@ -13279,7 +13279,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB12_10:
; SI-NEXT: ; implicit-def: $vgpr9
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB12_11: ; %frem.compute15
+; SI-NEXT: .LBB12_11: ; %frem.compute46
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v1|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v9, v1
@@ -13315,10 +13315,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB12_15
-; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; SI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB12_13: ; %frem.loop_body23
+; SI-NEXT: .LBB12_13: ; %frem.loop_body54
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v12, v10
; SI-NEXT: v_mul_f32_e32 v10, v12, v11
@@ -13333,7 +13333,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB12_13
; SI-NEXT: ; %bb.14: ; %Flow121
; SI-NEXT: v_mov_b32_e32 v10, v12
-; SI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; SI-NEXT: .LBB12_15: ; %frem.loop_exit55
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v10, v10, s3
; SI-NEXT: v_mul_f32_e32 v11, v10, v11
@@ -13349,7 +13349,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB12_18
-; SI-NEXT: ; %bb.17: ; %frem.else47
+; SI-NEXT: ; %bb.17: ; %frem.else16
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v10, s2, 0, v2
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6|
@@ -13360,7 +13360,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB12_18:
; SI-NEXT: ; implicit-def: $vgpr10
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB12_19: ; %frem.compute46
+; SI-NEXT: .LBB12_19: ; %frem.compute15
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v2|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v10, v2
@@ -13396,10 +13396,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB12_23
-; SI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; SI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB12_21: ; %frem.loop_body54
+; SI-NEXT: .LBB12_21: ; %frem.loop_body23
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v13, v11
; SI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -13414,7 +13414,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB12_21
; SI-NEXT: ; %bb.22: ; %Flow117
; SI-NEXT: v_mov_b32_e32 v11, v13
-; SI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; SI-NEXT: .LBB12_23: ; %frem.loop_exit24
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v11, v11, s3
; SI-NEXT: v_mul_f32_e32 v12, v11, v12
@@ -13430,7 +13430,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7|
; SI-NEXT: s_and_b64 vcc, exec, s[2:3]
; SI-NEXT: s_cbranch_vccz .LBB12_26
-; SI-NEXT: ; %bb.25: ; %frem.else78
+; SI-NEXT: ; %bb.25: ; %frem.else
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_bfi_b32 v11, s2, 0, v3
; SI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7|
@@ -13441,7 +13441,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB12_26:
; SI-NEXT: ; implicit-def: $vgpr11
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB12_27: ; %frem.compute77
+; SI-NEXT: .LBB12_27: ; %frem.compute
; SI-NEXT: s_mov_b32 s6, 0x7f800000
; SI-NEXT: v_cmp_lt_f32_e64 s[2:3], |v3|, s6
; SI-NEXT: v_frexp_exp_i32_f32_e32 v11, v3
@@ -13477,10 +13477,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0
; SI-NEXT: s_cmp_lt_i32 s3, 13
; SI-NEXT: s_cbranch_scc1 .LBB12_31
-; SI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; SI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; SI-NEXT: s_sub_i32 s3, s4, s5
; SI-NEXT: s_add_i32 s3, s3, 12
-; SI-NEXT: .LBB12_29: ; %frem.loop_body85
+; SI-NEXT: .LBB12_29: ; %frem.loop_body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v14, v12
; SI-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -13495,7 +13495,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_cbranch_scc1 .LBB12_29
; SI-NEXT: ; %bb.30: ; %Flow
; SI-NEXT: v_mov_b32_e32 v12, v14
-; SI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; SI-NEXT: .LBB12_31: ; %frem.loop_exit
; SI-NEXT: s_add_i32 s3, s3, -11
; SI-NEXT: v_ldexp_f32_e64 v12, v12, s3
; SI-NEXT: v_mul_f32_e32 v13, v12, v13
@@ -13548,7 +13548,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB12_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else78
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v8, s2, 0, v0
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4|
@@ -13557,7 +13557,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_8
; CI-NEXT: .LBB12_2:
; CI-NEXT: ; implicit-def: $vgpr8
-; CI-NEXT: .LBB12_3: ; %frem.compute
+; CI-NEXT: .LBB12_3: ; %frem.compute77
; CI-NEXT: v_frexp_mant_f32_e64 v9, |v4|
; CI-NEXT: v_ldexp_f32_e64 v9, v9, 1
; CI-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0
@@ -13582,10 +13582,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10
; CI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_7
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; CI-NEXT: v_sub_i32_e32 v10, vcc, v13, v14
; CI-NEXT: v_add_i32_e32 v10, vcc, 12, v10
-; CI-NEXT: .LBB12_5: ; %frem.loop_body
+; CI-NEXT: .LBB12_5: ; %frem.loop_body85
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v13, v11
; CI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -13600,7 +13600,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB12_5
; CI-NEXT: ; %bb.6: ; %Flow125
; CI-NEXT: v_mov_b32_e32 v11, v13
-; CI-NEXT: .LBB12_7: ; %frem.loop_exit
+; CI-NEXT: .LBB12_7: ; %frem.loop_exit86
; CI-NEXT: v_add_i32_e32 v10, vcc, -11, v10
; CI-NEXT: v_ldexp_f32_e32 v10, v11, v10
; CI-NEXT: v_mul_f32_e32 v11, v10, v12
@@ -13616,7 +13616,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB12_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else47
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v9, s2, 0, v1
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5|
@@ -13625,7 +13625,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_16
; CI-NEXT: .LBB12_10:
; CI-NEXT: ; implicit-def: $vgpr9
-; CI-NEXT: .LBB12_11: ; %frem.compute15
+; CI-NEXT: .LBB12_11: ; %frem.compute46
; CI-NEXT: v_frexp_mant_f32_e64 v10, |v5|
; CI-NEXT: v_ldexp_f32_e64 v10, v10, 1
; CI-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0
@@ -13650,10 +13650,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11
; CI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_15
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; CI-NEXT: v_sub_i32_e32 v11, vcc, v14, v15
; CI-NEXT: v_add_i32_e32 v11, vcc, 12, v11
-; CI-NEXT: .LBB12_13: ; %frem.loop_body23
+; CI-NEXT: .LBB12_13: ; %frem.loop_body54
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v14, v12
; CI-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -13668,7 +13668,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB12_13
; CI-NEXT: ; %bb.14: ; %Flow121
; CI-NEXT: v_mov_b32_e32 v12, v14
-; CI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB12_15: ; %frem.loop_exit55
; CI-NEXT: v_add_i32_e32 v11, vcc, -11, v11
; CI-NEXT: v_ldexp_f32_e32 v11, v12, v11
; CI-NEXT: v_mul_f32_e32 v12, v11, v13
@@ -13684,7 +13684,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB12_18
-; CI-NEXT: ; %bb.17: ; %frem.else47
+; CI-NEXT: ; %bb.17: ; %frem.else16
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v10, s2, 0, v2
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6|
@@ -13693,7 +13693,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_24
; CI-NEXT: .LBB12_18:
; CI-NEXT: ; implicit-def: $vgpr10
-; CI-NEXT: .LBB12_19: ; %frem.compute46
+; CI-NEXT: .LBB12_19: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f32_e64 v11, |v6|
; CI-NEXT: v_ldexp_f32_e64 v11, v11, 1
; CI-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0
@@ -13718,10 +13718,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12
; CI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_23
-; CI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; CI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; CI-NEXT: v_sub_i32_e32 v12, vcc, v15, v16
; CI-NEXT: v_add_i32_e32 v12, vcc, 12, v12
-; CI-NEXT: .LBB12_21: ; %frem.loop_body54
+; CI-NEXT: .LBB12_21: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v15, v13
; CI-NEXT: v_mul_f32_e32 v13, v15, v14
@@ -13736,7 +13736,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB12_21
; CI-NEXT: ; %bb.22: ; %Flow117
; CI-NEXT: v_mov_b32_e32 v13, v15
-; CI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; CI-NEXT: .LBB12_23: ; %frem.loop_exit24
; CI-NEXT: v_add_i32_e32 v12, vcc, -11, v12
; CI-NEXT: v_ldexp_f32_e32 v12, v13, v12
; CI-NEXT: v_mul_f32_e32 v13, v12, v14
@@ -13752,7 +13752,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB12_26
-; CI-NEXT: ; %bb.25: ; %frem.else78
+; CI-NEXT: ; %bb.25: ; %frem.else
; CI-NEXT: s_brev_b32 s2, -2
; CI-NEXT: v_bfi_b32 v11, s2, 0, v3
; CI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7|
@@ -13761,7 +13761,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB12_32
; CI-NEXT: .LBB12_26:
; CI-NEXT: ; implicit-def: $vgpr11
-; CI-NEXT: .LBB12_27: ; %frem.compute77
+; CI-NEXT: .LBB12_27: ; %frem.compute
; CI-NEXT: v_frexp_mant_f32_e64 v12, |v7|
; CI-NEXT: v_ldexp_f32_e64 v12, v12, 1
; CI-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0
@@ -13786,10 +13786,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13
; CI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0
; CI-NEXT: s_cbranch_vccnz .LBB12_31
-; CI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; CI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; CI-NEXT: v_sub_i32_e32 v13, vcc, v16, v17
; CI-NEXT: v_add_i32_e32 v13, vcc, 12, v13
-; CI-NEXT: .LBB12_29: ; %frem.loop_body85
+; CI-NEXT: .LBB12_29: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v16, v14
; CI-NEXT: v_mul_f32_e32 v14, v16, v15
@@ -13804,7 +13804,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_cbranch_vccnz .LBB12_29
; CI-NEXT: ; %bb.30: ; %Flow
; CI-NEXT: v_mov_b32_e32 v14, v16
-; CI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; CI-NEXT: .LBB12_31: ; %frem.loop_exit
; CI-NEXT: v_add_i32_e32 v13, vcc, -11, v13
; CI-NEXT: v_ldexp_f32_e32 v13, v14, v13
; CI-NEXT: v_mul_f32_e32 v14, v13, v15
@@ -13857,7 +13857,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB12_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else78
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v8, s2, 0, v0
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4|
@@ -13866,7 +13866,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_8
; VI-NEXT: .LBB12_2:
; VI-NEXT: ; implicit-def: $vgpr8
-; VI-NEXT: .LBB12_3: ; %frem.compute
+; VI-NEXT: .LBB12_3: ; %frem.compute77
; VI-NEXT: v_frexp_mant_f32_e64 v9, |v4|
; VI-NEXT: v_ldexp_f32 v9, v9, 1
; VI-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0
@@ -13891,10 +13891,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10
; VI-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_7
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; VI-NEXT: v_sub_u32_e32 v10, vcc, v13, v14
; VI-NEXT: v_add_u32_e32 v10, vcc, 12, v10
-; VI-NEXT: .LBB12_5: ; %frem.loop_body
+; VI-NEXT: .LBB12_5: ; %frem.loop_body85
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v13, v11
; VI-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -13909,7 +13909,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB12_5
; VI-NEXT: ; %bb.6: ; %Flow125
; VI-NEXT: v_mov_b32_e32 v11, v13
-; VI-NEXT: .LBB12_7: ; %frem.loop_exit
+; VI-NEXT: .LBB12_7: ; %frem.loop_exit86
; VI-NEXT: v_add_u32_e32 v10, vcc, -11, v10
; VI-NEXT: v_ldexp_f32 v10, v11, v10
; VI-NEXT: v_mul_f32_e32 v11, v10, v12
@@ -13925,7 +13925,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB12_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else47
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v9, s2, 0, v1
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5|
@@ -13934,7 +13934,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_16
; VI-NEXT: .LBB12_10:
; VI-NEXT: ; implicit-def: $vgpr9
-; VI-NEXT: .LBB12_11: ; %frem.compute15
+; VI-NEXT: .LBB12_11: ; %frem.compute46
; VI-NEXT: v_frexp_mant_f32_e64 v10, |v5|
; VI-NEXT: v_ldexp_f32 v10, v10, 1
; VI-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0
@@ -13959,10 +13959,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11
; VI-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_15
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; VI-NEXT: v_sub_u32_e32 v11, vcc, v14, v15
; VI-NEXT: v_add_u32_e32 v11, vcc, 12, v11
-; VI-NEXT: .LBB12_13: ; %frem.loop_body23
+; VI-NEXT: .LBB12_13: ; %frem.loop_body54
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v14, v12
; VI-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -13977,7 +13977,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB12_13
; VI-NEXT: ; %bb.14: ; %Flow121
; VI-NEXT: v_mov_b32_e32 v12, v14
-; VI-NEXT: .LBB12_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB12_15: ; %frem.loop_exit55
; VI-NEXT: v_add_u32_e32 v11, vcc, -11, v11
; VI-NEXT: v_ldexp_f32 v11, v12, v11
; VI-NEXT: v_mul_f32_e32 v12, v11, v13
@@ -13993,7 +13993,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB12_18
-; VI-NEXT: ; %bb.17: ; %frem.else47
+; VI-NEXT: ; %bb.17: ; %frem.else16
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v10, s2, 0, v2
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6|
@@ -14002,7 +14002,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_24
; VI-NEXT: .LBB12_18:
; VI-NEXT: ; implicit-def: $vgpr10
-; VI-NEXT: .LBB12_19: ; %frem.compute46
+; VI-NEXT: .LBB12_19: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f32_e64 v11, |v6|
; VI-NEXT: v_ldexp_f32 v11, v11, 1
; VI-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0
@@ -14027,10 +14027,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12
; VI-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_23
-; VI-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; VI-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; VI-NEXT: v_sub_u32_e32 v12, vcc, v15, v16
; VI-NEXT: v_add_u32_e32 v12, vcc, 12, v12
-; VI-NEXT: .LBB12_21: ; %frem.loop_body54
+; VI-NEXT: .LBB12_21: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v15, v13
; VI-NEXT: v_mul_f32_e32 v13, v15, v14
@@ -14045,7 +14045,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB12_21
; VI-NEXT: ; %bb.22: ; %Flow117
; VI-NEXT: v_mov_b32_e32 v13, v15
-; VI-NEXT: .LBB12_23: ; %frem.loop_exit55
+; VI-NEXT: .LBB12_23: ; %frem.loop_exit24
; VI-NEXT: v_add_u32_e32 v12, vcc, -11, v12
; VI-NEXT: v_ldexp_f32 v12, v13, v12
; VI-NEXT: v_mul_f32_e32 v13, v12, v14
@@ -14061,7 +14061,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB12_26
-; VI-NEXT: ; %bb.25: ; %frem.else78
+; VI-NEXT: ; %bb.25: ; %frem.else
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_bfi_b32 v11, s2, 0, v3
; VI-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7|
@@ -14070,7 +14070,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB12_32
; VI-NEXT: .LBB12_26:
; VI-NEXT: ; implicit-def: $vgpr11
-; VI-NEXT: .LBB12_27: ; %frem.compute77
+; VI-NEXT: .LBB12_27: ; %frem.compute
; VI-NEXT: v_frexp_mant_f32_e64 v12, |v7|
; VI-NEXT: v_ldexp_f32 v12, v12, 1
; VI-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0
@@ -14095,10 +14095,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13
; VI-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0
; VI-NEXT: s_cbranch_vccnz .LBB12_31
-; VI-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; VI-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; VI-NEXT: v_sub_u32_e32 v13, vcc, v16, v17
; VI-NEXT: v_add_u32_e32 v13, vcc, 12, v13
-; VI-NEXT: .LBB12_29: ; %frem.loop_body85
+; VI-NEXT: .LBB12_29: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v16, v14
; VI-NEXT: v_mul_f32_e32 v14, v16, v15
@@ -14113,7 +14113,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_cbranch_vccnz .LBB12_29
; VI-NEXT: ; %bb.30: ; %Flow
; VI-NEXT: v_mov_b32_e32 v14, v16
-; VI-NEXT: .LBB12_31: ; %frem.loop_exit86
+; VI-NEXT: .LBB12_31: ; %frem.loop_exit
; VI-NEXT: v_add_u32_e32 v13, vcc, -11, v13
; VI-NEXT: v_ldexp_f32 v13, v14, v13
; VI-NEXT: v_mul_f32_e32 v14, v13, v15
@@ -14161,7 +14161,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v0|, |v4|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB12_2
-; GFX9-NEXT: ; %bb.1: ; %frem.else
+; GFX9-NEXT: ; %bb.1: ; %frem.else78
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v8, s2, 0, v0
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v0|, |v4|
@@ -14170,7 +14170,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB12_8
; GFX9-NEXT: .LBB12_2:
; GFX9-NEXT: ; implicit-def: $vgpr8
-; GFX9-NEXT: .LBB12_3: ; %frem.compute
+; GFX9-NEXT: .LBB12_3: ; %frem.compute77
; GFX9-NEXT: v_frexp_mant_f32_e64 v9, |v4|
; GFX9-NEXT: v_ldexp_f32 v9, v9, 1
; GFX9-NEXT: v_div_scale_f32 v15, s[2:3], v9, v9, 1.0
@@ -14195,10 +14195,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v10
; GFX9-NEXT: v_div_fixup_f32 v12, v12, v9, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB12_7
-; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; GFX9-NEXT: v_sub_u32_e32 v10, v13, v14
; GFX9-NEXT: v_add_u32_e32 v10, 12, v10
-; GFX9-NEXT: .LBB12_5: ; %frem.loop_body
+; GFX9-NEXT: .LBB12_5: ; %frem.loop_body85
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v13, v11
; GFX9-NEXT: v_mul_f32_e32 v11, v13, v12
@@ -14213,7 +14213,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB12_5
; GFX9-NEXT: ; %bb.6: ; %Flow125
; GFX9-NEXT: v_mov_b32_e32 v11, v13
-; GFX9-NEXT: .LBB12_7: ; %frem.loop_exit
+; GFX9-NEXT: .LBB12_7: ; %frem.loop_exit86
; GFX9-NEXT: v_add_u32_e32 v10, -11, v10
; GFX9-NEXT: v_ldexp_f32 v10, v11, v10
; GFX9-NEXT: v_mul_f32_e32 v11, v10, v12
@@ -14229,7 +14229,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v1|, |v5|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB12_10
-; GFX9-NEXT: ; %bb.9: ; %frem.else16
+; GFX9-NEXT: ; %bb.9: ; %frem.else47
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v9, s2, 0, v1
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v1|, |v5|
@@ -14238,7 +14238,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB12_16
; GFX9-NEXT: .LBB12_10:
; GFX9-NEXT: ; implicit-def: $vgpr9
-; GFX9-NEXT: .LBB12_11: ; %frem.compute15
+; GFX9-NEXT: .LBB12_11: ; %frem.compute46
; GFX9-NEXT: v_frexp_mant_f32_e64 v10, |v5|
; GFX9-NEXT: v_ldexp_f32 v10, v10, 1
; GFX9-NEXT: v_div_scale_f32 v16, s[2:3], v10, v10, 1.0
@@ -14263,10 +14263,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v11
; GFX9-NEXT: v_div_fixup_f32 v13, v13, v10, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB12_15
-; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; GFX9-NEXT: v_sub_u32_e32 v11, v14, v15
; GFX9-NEXT: v_add_u32_e32 v11, 12, v11
-; GFX9-NEXT: .LBB12_13: ; %frem.loop_body23
+; GFX9-NEXT: .LBB12_13: ; %frem.loop_body54
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v14, v12
; GFX9-NEXT: v_mul_f32_e32 v12, v14, v13
@@ -14281,7 +14281,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB12_13
; GFX9-NEXT: ; %bb.14: ; %Flow121
; GFX9-NEXT: v_mov_b32_e32 v12, v14
-; GFX9-NEXT: .LBB12_15: ; %frem.loop_exit24
+; GFX9-NEXT: .LBB12_15: ; %frem.loop_exit55
; GFX9-NEXT: v_add_u32_e32 v11, -11, v11
; GFX9-NEXT: v_ldexp_f32 v11, v12, v11
; GFX9-NEXT: v_mul_f32_e32 v12, v11, v13
@@ -14297,7 +14297,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v2|, |v6|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB12_18
-; GFX9-NEXT: ; %bb.17: ; %frem.else47
+; GFX9-NEXT: ; %bb.17: ; %frem.else16
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v10, s2, 0, v2
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v2|, |v6|
@@ -14306,7 +14306,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB12_24
; GFX9-NEXT: .LBB12_18:
; GFX9-NEXT: ; implicit-def: $vgpr10
-; GFX9-NEXT: .LBB12_19: ; %frem.compute46
+; GFX9-NEXT: .LBB12_19: ; %frem.compute15
; GFX9-NEXT: v_frexp_mant_f32_e64 v11, |v6|
; GFX9-NEXT: v_ldexp_f32 v11, v11, 1
; GFX9-NEXT: v_div_scale_f32 v17, s[2:3], v11, v11, 1.0
@@ -14331,10 +14331,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v12
; GFX9-NEXT: v_div_fixup_f32 v14, v14, v11, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB12_23
-; GFX9-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; GFX9-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; GFX9-NEXT: v_sub_u32_e32 v12, v15, v16
; GFX9-NEXT: v_add_u32_e32 v12, 12, v12
-; GFX9-NEXT: .LBB12_21: ; %frem.loop_body54
+; GFX9-NEXT: .LBB12_21: ; %frem.loop_body23
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v15, v13
; GFX9-NEXT: v_mul_f32_e32 v13, v15, v14
@@ -14349,7 +14349,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB12_21
; GFX9-NEXT: ; %bb.22: ; %Flow117
; GFX9-NEXT: v_mov_b32_e32 v13, v15
-; GFX9-NEXT: .LBB12_23: ; %frem.loop_exit55
+; GFX9-NEXT: .LBB12_23: ; %frem.loop_exit24
; GFX9-NEXT: v_add_u32_e32 v12, -11, v12
; GFX9-NEXT: v_ldexp_f32 v12, v13, v12
; GFX9-NEXT: v_mul_f32_e32 v13, v12, v14
@@ -14365,7 +14365,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f32_e64 s[2:3], |v3|, |v7|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB12_26
-; GFX9-NEXT: ; %bb.25: ; %frem.else78
+; GFX9-NEXT: ; %bb.25: ; %frem.else
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_bfi_b32 v11, s2, 0, v3
; GFX9-NEXT: v_cmp_eq_f32_e64 vcc, |v3|, |v7|
@@ -14374,7 +14374,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB12_32
; GFX9-NEXT: .LBB12_26:
; GFX9-NEXT: ; implicit-def: $vgpr11
-; GFX9-NEXT: .LBB12_27: ; %frem.compute77
+; GFX9-NEXT: .LBB12_27: ; %frem.compute
; GFX9-NEXT: v_frexp_mant_f32_e64 v12, |v7|
; GFX9-NEXT: v_ldexp_f32 v12, v12, 1
; GFX9-NEXT: v_div_scale_f32 v18, s[2:3], v12, v12, 1.0
@@ -14399,10 +14399,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 13, v13
; GFX9-NEXT: v_div_fixup_f32 v15, v15, v12, 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB12_31
-; GFX9-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; GFX9-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX9-NEXT: v_sub_u32_e32 v13, v16, v17
; GFX9-NEXT: v_add_u32_e32 v13, 12, v13
-; GFX9-NEXT: .LBB12_29: ; %frem.loop_body85
+; GFX9-NEXT: .LBB12_29: ; %frem.loop_body
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v16, v14
; GFX9-NEXT: v_mul_f32_e32 v14, v16, v15
@@ -14417,7 +14417,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_cbranch_vccnz .LBB12_29
; GFX9-NEXT: ; %bb.30: ; %Flow
; GFX9-NEXT: v_mov_b32_e32 v14, v16
-; GFX9-NEXT: .LBB12_31: ; %frem.loop_exit86
+; GFX9-NEXT: .LBB12_31: ; %frem.loop_exit
; GFX9-NEXT: v_add_u32_e32 v13, -11, v13
; GFX9-NEXT: v_ldexp_f32 v13, v14, v13
; GFX9-NEXT: v_mul_f32_e32 v14, v13, v15
@@ -14466,7 +14466,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v4|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB12_2
-; GFX10-NEXT: ; %bb.1: ; %frem.else
+; GFX10-NEXT: ; %bb.1: ; %frem.else78
; GFX10-NEXT: v_bfi_b32 v8, 0x7fffffff, 0, v0
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4|
; GFX10-NEXT: v_cndmask_b32_e32 v8, v0, v8, vcc_lo
@@ -14474,7 +14474,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB12_8
; GFX10-NEXT: .LBB12_2:
; GFX10-NEXT: ; implicit-def: $vgpr8
-; GFX10-NEXT: .LBB12_3: ; %frem.compute
+; GFX10-NEXT: .LBB12_3: ; %frem.compute77
; GFX10-NEXT: v_frexp_mant_f32_e64 v9, |v4|
; GFX10-NEXT: v_frexp_mant_f32_e64 v8, |v0|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v11, v0
@@ -14501,10 +14501,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v12
; GFX10-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB12_7
-; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB12_5: ; %frem.loop_body
+; GFX10-NEXT: .LBB12_5: ; %frem.loop_body85
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v13, v10
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -14520,7 +14520,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.6: ; %Flow125
; GFX10-NEXT: v_mov_b32_e32 v12, s2
; GFX10-NEXT: v_mov_b32_e32 v10, v13
-; GFX10-NEXT: .LBB12_7: ; %frem.loop_exit
+; GFX10-NEXT: .LBB12_7: ; %frem.loop_exit86
; GFX10-NEXT: v_add_nc_u32_e32 v12, -11, v12
; GFX10-NEXT: v_ldexp_f32 v10, v10, v12
; GFX10-NEXT: v_mul_f32_e32 v11, v10, v11
@@ -14535,7 +14535,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v5|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB12_10
-; GFX10-NEXT: ; %bb.9: ; %frem.else16
+; GFX10-NEXT: ; %bb.9: ; %frem.else47
; GFX10-NEXT: v_bfi_b32 v9, 0x7fffffff, 0, v1
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5|
; GFX10-NEXT: v_cndmask_b32_e32 v9, v1, v9, vcc_lo
@@ -14543,7 +14543,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB12_16
; GFX10-NEXT: .LBB12_10:
; GFX10-NEXT: ; implicit-def: $vgpr9
-; GFX10-NEXT: .LBB12_11: ; %frem.compute15
+; GFX10-NEXT: .LBB12_11: ; %frem.compute46
; GFX10-NEXT: v_frexp_mant_f32_e64 v10, |v5|
; GFX10-NEXT: v_frexp_mant_f32_e64 v9, |v1|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v12, v1
@@ -14570,10 +14570,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v13
; GFX10-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB12_15
-; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB12_13: ; %frem.loop_body23
+; GFX10-NEXT: .LBB12_13: ; %frem.loop_body54
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v14, v11
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -14589,7 +14589,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.14: ; %Flow121
; GFX10-NEXT: v_mov_b32_e32 v13, s2
; GFX10-NEXT: v_mov_b32_e32 v11, v14
-; GFX10-NEXT: .LBB12_15: ; %frem.loop_exit24
+; GFX10-NEXT: .LBB12_15: ; %frem.loop_exit55
; GFX10-NEXT: v_add_nc_u32_e32 v13, -11, v13
; GFX10-NEXT: v_ldexp_f32 v11, v11, v13
; GFX10-NEXT: v_mul_f32_e32 v12, v11, v12
@@ -14604,7 +14604,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v2|, |v6|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB12_18
-; GFX10-NEXT: ; %bb.17: ; %frem.else47
+; GFX10-NEXT: ; %bb.17: ; %frem.else16
; GFX10-NEXT: v_bfi_b32 v10, 0x7fffffff, 0, v2
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6|
; GFX10-NEXT: v_cndmask_b32_e32 v10, v2, v10, vcc_lo
@@ -14612,7 +14612,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB12_24
; GFX10-NEXT: .LBB12_18:
; GFX10-NEXT: ; implicit-def: $vgpr10
-; GFX10-NEXT: .LBB12_19: ; %frem.compute46
+; GFX10-NEXT: .LBB12_19: ; %frem.compute15
; GFX10-NEXT: v_frexp_mant_f32_e64 v11, |v6|
; GFX10-NEXT: v_frexp_mant_f32_e64 v10, |v2|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v13, v2
@@ -14639,10 +14639,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v14
; GFX10-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB12_23
-; GFX10-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; GFX10-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB12_21: ; %frem.loop_body54
+; GFX10-NEXT: .LBB12_21: ; %frem.loop_body23
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v15, v12
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -14658,7 +14658,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.22: ; %Flow117
; GFX10-NEXT: v_mov_b32_e32 v14, s2
; GFX10-NEXT: v_mov_b32_e32 v12, v15
-; GFX10-NEXT: .LBB12_23: ; %frem.loop_exit55
+; GFX10-NEXT: .LBB12_23: ; %frem.loop_exit24
; GFX10-NEXT: v_add_nc_u32_e32 v14, -11, v14
; GFX10-NEXT: v_ldexp_f32 v12, v12, v14
; GFX10-NEXT: v_mul_f32_e32 v13, v12, v13
@@ -14673,7 +14673,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f32_e64 s2, |v3|, |v7|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB12_26
-; GFX10-NEXT: ; %bb.25: ; %frem.else78
+; GFX10-NEXT: ; %bb.25: ; %frem.else
; GFX10-NEXT: v_bfi_b32 v11, 0x7fffffff, 0, v3
; GFX10-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7|
; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v11, vcc_lo
@@ -14681,7 +14681,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB12_32
; GFX10-NEXT: .LBB12_26:
; GFX10-NEXT: ; implicit-def: $vgpr11
-; GFX10-NEXT: .LBB12_27: ; %frem.compute77
+; GFX10-NEXT: .LBB12_27: ; %frem.compute
; GFX10-NEXT: v_frexp_mant_f32_e64 v12, |v7|
; GFX10-NEXT: v_frexp_mant_f32_e64 v11, |v3|
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v14, v3
@@ -14708,10 +14708,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v15
; GFX10-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB12_31
-; GFX10-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; GFX10-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 12
-; GFX10-NEXT: .LBB12_29: ; %frem.loop_body85
+; GFX10-NEXT: .LBB12_29: ; %frem.loop_body
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v16, v13
; GFX10-NEXT: s_add_i32 s2, s2, -12
@@ -14727,7 +14727,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: ; %bb.30: ; %Flow
; GFX10-NEXT: v_mov_b32_e32 v15, s2
; GFX10-NEXT: v_mov_b32_e32 v13, v16
-; GFX10-NEXT: .LBB12_31: ; %frem.loop_exit86
+; GFX10-NEXT: .LBB12_31: ; %frem.loop_exit
; GFX10-NEXT: v_add_nc_u32_e32 v15, -11, v15
; GFX10-NEXT: v_ldexp_f32 v13, v13, v15
; GFX10-NEXT: v_mul_f32_e32 v14, v13, v14
@@ -14773,7 +14773,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v0|, |v4|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB12_2
-; GFX11-NEXT: ; %bb.1: ; %frem.else
+; GFX11-NEXT: ; %bb.1: ; %frem.else78
; GFX11-NEXT: v_bfi_b32 v8, 0x7fffffff, 0, v0
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -14782,7 +14782,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB12_8
; GFX11-NEXT: .LBB12_2:
; GFX11-NEXT: ; implicit-def: $vgpr8
-; GFX11-NEXT: .LBB12_3: ; %frem.compute
+; GFX11-NEXT: .LBB12_3: ; %frem.compute77
; GFX11-NEXT: v_frexp_mant_f32_e64 v9, |v4|
; GFX11-NEXT: v_frexp_mant_f32_e64 v8, |v0|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v11, v0
@@ -14818,11 +14818,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v11, v11, v9, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB12_7
-; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB12_5: ; %frem.loop_body
+; GFX11-NEXT: .LBB12_5: ; %frem.loop_body85
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v13, v10
@@ -14842,7 +14842,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.6: ; %Flow125
; GFX11-NEXT: v_mov_b32_e32 v12, s2
; GFX11-NEXT: v_mov_b32_e32 v10, v13
-; GFX11-NEXT: .LBB12_7: ; %frem.loop_exit
+; GFX11-NEXT: .LBB12_7: ; %frem.loop_exit86
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v12, -11, v12
; GFX11-NEXT: v_ldexp_f32 v10, v10, v12
@@ -14862,7 +14862,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v1|, |v5|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB12_10
-; GFX11-NEXT: ; %bb.9: ; %frem.else16
+; GFX11-NEXT: ; %bb.9: ; %frem.else47
; GFX11-NEXT: v_bfi_b32 v9, 0x7fffffff, 0, v1
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -14871,7 +14871,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB12_16
; GFX11-NEXT: .LBB12_10:
; GFX11-NEXT: ; implicit-def: $vgpr9
-; GFX11-NEXT: .LBB12_11: ; %frem.compute15
+; GFX11-NEXT: .LBB12_11: ; %frem.compute46
; GFX11-NEXT: v_frexp_mant_f32_e64 v10, |v5|
; GFX11-NEXT: v_frexp_mant_f32_e64 v9, |v1|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v12, v1
@@ -14907,11 +14907,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v12, v12, v10, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB12_15
-; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX11-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB12_13: ; %frem.loop_body23
+; GFX11-NEXT: .LBB12_13: ; %frem.loop_body54
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v14, v11
@@ -14931,7 +14931,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.14: ; %Flow121
; GFX11-NEXT: v_mov_b32_e32 v13, s2
; GFX11-NEXT: v_mov_b32_e32 v11, v14
-; GFX11-NEXT: .LBB12_15: ; %frem.loop_exit24
+; GFX11-NEXT: .LBB12_15: ; %frem.loop_exit55
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v13, -11, v13
; GFX11-NEXT: v_ldexp_f32 v11, v11, v13
@@ -14951,7 +14951,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v2|, |v6|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB12_18
-; GFX11-NEXT: ; %bb.17: ; %frem.else47
+; GFX11-NEXT: ; %bb.17: ; %frem.else16
; GFX11-NEXT: v_bfi_b32 v10, 0x7fffffff, 0, v2
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -14960,7 +14960,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB12_24
; GFX11-NEXT: .LBB12_18:
; GFX11-NEXT: ; implicit-def: $vgpr10
-; GFX11-NEXT: .LBB12_19: ; %frem.compute46
+; GFX11-NEXT: .LBB12_19: ; %frem.compute15
; GFX11-NEXT: v_frexp_mant_f32_e64 v11, |v6|
; GFX11-NEXT: v_frexp_mant_f32_e64 v10, |v2|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v13, v2
@@ -14996,11 +14996,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v13, v13, v11, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB12_23
-; GFX11-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; GFX11-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB12_21: ; %frem.loop_body54
+; GFX11-NEXT: .LBB12_21: ; %frem.loop_body23
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v15, v12
@@ -15020,7 +15020,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.22: ; %Flow117
; GFX11-NEXT: v_mov_b32_e32 v14, s2
; GFX11-NEXT: v_mov_b32_e32 v12, v15
-; GFX11-NEXT: .LBB12_23: ; %frem.loop_exit55
+; GFX11-NEXT: .LBB12_23: ; %frem.loop_exit24
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v14, -11, v14
; GFX11-NEXT: v_ldexp_f32 v12, v12, v14
@@ -15040,7 +15040,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f32_e64 s2, |v3|, |v7|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB12_26
-; GFX11-NEXT: ; %bb.25: ; %frem.else78
+; GFX11-NEXT: ; %bb.25: ; %frem.else
; GFX11-NEXT: v_bfi_b32 v11, 0x7fffffff, 0, v3
; GFX11-NEXT: v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7|
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
@@ -15049,7 +15049,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB12_32
; GFX11-NEXT: .LBB12_26:
; GFX11-NEXT: ; implicit-def: $vgpr11
-; GFX11-NEXT: .LBB12_27: ; %frem.compute77
+; GFX11-NEXT: .LBB12_27: ; %frem.compute
; GFX11-NEXT: v_frexp_mant_f32_e64 v12, |v7|
; GFX11-NEXT: v_frexp_mant_f32_e64 v11, |v3|
; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v14, v3
@@ -15085,11 +15085,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f32 v14, v14, v12, 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB12_31
-; GFX11-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; GFX11-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 12
-; GFX11-NEXT: .LBB12_29: ; %frem.loop_body85
+; GFX11-NEXT: .LBB12_29: ; %frem.loop_body
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_mov_b32_e32 v16, v13
@@ -15109,7 +15109,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.30: ; %Flow
; GFX11-NEXT: v_mov_b32_e32 v15, s2
; GFX11-NEXT: v_mov_b32_e32 v13, v16
-; GFX11-NEXT: .LBB12_31: ; %frem.loop_exit86
+; GFX11-NEXT: .LBB12_31: ; %frem.loop_exit
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_nc_u32_e32 v15, -11, v15
; GFX11-NEXT: v_ldexp_f32 v13, v13, v15
@@ -15170,7 +15170,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s5, s12
; GFX1150-NEXT: s_cbranch_scc0 .LBB12_2
-; GFX1150-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-NEXT: ; %bb.1: ; %frem.else78
; GFX1150-NEXT: s_cmp_eq_f32 s5, s12
; GFX1150-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15180,7 +15180,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB12_8
; GFX1150-NEXT: .LBB12_2:
; GFX1150-NEXT: ; implicit-def: $vgpr0
-; GFX1150-NEXT: .LBB12_3: ; %frem.compute
+; GFX1150-NEXT: .LBB12_3: ; %frem.compute77
; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s6|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v0, |s8|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -15215,11 +15215,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4
; GFX1150-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB12_7
-; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; GFX1150-NEXT: s_sub_i32 s11, s11, s12
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s11, s11, 12
-; GFX1150-NEXT: .LBB12_5: ; %frem.loop_body
+; GFX1150-NEXT: .LBB12_5: ; %frem.loop_body85
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v5, v2
@@ -15241,7 +15241,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.6: ; %Flow125
; GFX1150-NEXT: v_mov_b32_e32 v4, s11
; GFX1150-NEXT: v_mov_b32_e32 v2, v5
-; GFX1150-NEXT: .LBB12_7: ; %frem.loop_exit
+; GFX1150-NEXT: .LBB12_7: ; %frem.loop_exit86
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v4, -11, v4
; GFX1150-NEXT: v_ldexp_f32 v2, v2, v4
@@ -15264,7 +15264,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s8, s12
; GFX1150-NEXT: s_cbranch_scc0 .LBB12_10
-; GFX1150-NEXT: ; %bb.9: ; %frem.else16
+; GFX1150-NEXT: ; %bb.9: ; %frem.else47
; GFX1150-NEXT: s_cmp_eq_f32 s8, s12
; GFX1150-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s10
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15274,7 +15274,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB12_16
; GFX1150-NEXT: .LBB12_10:
; GFX1150-NEXT: ; implicit-def: $vgpr1
-; GFX1150-NEXT: .LBB12_11: ; %frem.compute15
+; GFX1150-NEXT: .LBB12_11: ; %frem.compute46
; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s4|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v1, |s10|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -15309,11 +15309,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5
; GFX1150-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB12_15
-; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; GFX1150-NEXT: s_sub_i32 s11, s11, s12
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s11, s11, 12
-; GFX1150-NEXT: .LBB12_13: ; %frem.loop_body23
+; GFX1150-NEXT: .LBB12_13: ; %frem.loop_body54
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v6, v3
@@ -15335,7 +15335,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.14: ; %Flow121
; GFX1150-NEXT: v_mov_b32_e32 v5, s11
; GFX1150-NEXT: v_mov_b32_e32 v3, v6
-; GFX1150-NEXT: .LBB12_15: ; %frem.loop_exit24
+; GFX1150-NEXT: .LBB12_15: ; %frem.loop_exit55
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v5, -11, v5
; GFX1150-NEXT: v_ldexp_f32 v3, v3, v5
@@ -15358,7 +15358,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s10, s12
; GFX1150-NEXT: s_cbranch_scc0 .LBB12_18
-; GFX1150-NEXT: ; %bb.17: ; %frem.else47
+; GFX1150-NEXT: ; %bb.17: ; %frem.else16
; GFX1150-NEXT: s_cmp_eq_f32 s10, s12
; GFX1150-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, s9
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15368,7 +15368,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB12_24
; GFX1150-NEXT: .LBB12_18:
; GFX1150-NEXT: ; implicit-def: $vgpr2
-; GFX1150-NEXT: .LBB12_19: ; %frem.compute46
+; GFX1150-NEXT: .LBB12_19: ; %frem.compute15
; GFX1150-NEXT: v_frexp_mant_f32_e64 v3, |s3|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v2, |s9|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v5, s9
@@ -15403,11 +15403,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6
; GFX1150-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB12_23
-; GFX1150-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; GFX1150-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; GFX1150-NEXT: s_sub_i32 s11, s11, s12
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s11, s11, 12
-; GFX1150-NEXT: .LBB12_21: ; %frem.loop_body54
+; GFX1150-NEXT: .LBB12_21: ; %frem.loop_body23
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v7, v4
@@ -15429,7 +15429,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.22: ; %Flow117
; GFX1150-NEXT: v_mov_b32_e32 v6, s11
; GFX1150-NEXT: v_mov_b32_e32 v4, v7
-; GFX1150-NEXT: .LBB12_23: ; %frem.loop_exit55
+; GFX1150-NEXT: .LBB12_23: ; %frem.loop_exit24
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v6, -11, v6
; GFX1150-NEXT: v_ldexp_f32 v4, v4, v6
@@ -15452,7 +15452,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_cmp_ngt_f32 s9, s12
; GFX1150-NEXT: s_cbranch_scc0 .LBB12_26
-; GFX1150-NEXT: ; %bb.25: ; %frem.else78
+; GFX1150-NEXT: ; %bb.25: ; %frem.else
; GFX1150-NEXT: s_cmp_eq_f32 s9, s12
; GFX1150-NEXT: v_bfi_b32 v3, 0x7fffffff, 0, s7
; GFX1150-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15462,7 +15462,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB12_32
; GFX1150-NEXT: .LBB12_26:
; GFX1150-NEXT: ; implicit-def: $vgpr3
-; GFX1150-NEXT: .LBB12_27: ; %frem.compute77
+; GFX1150-NEXT: .LBB12_27: ; %frem.compute
; GFX1150-NEXT: v_frexp_mant_f32_e64 v4, |s2|
; GFX1150-NEXT: v_frexp_mant_f32_e64 v3, |s7|
; GFX1150-NEXT: v_frexp_exp_i32_f32_e32 v6, s7
@@ -15497,11 +15497,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7
; GFX1150-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB12_31
-; GFX1150-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; GFX1150-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1150-NEXT: s_sub_i32 s11, s11, s12
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s11, s11, 12
-; GFX1150-NEXT: .LBB12_29: ; %frem.loop_body85
+; GFX1150-NEXT: .LBB12_29: ; %frem.loop_body
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_mov_b32_e32 v8, v5
@@ -15523,7 +15523,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.30: ; %Flow
; GFX1150-NEXT: v_mov_b32_e32 v7, s11
; GFX1150-NEXT: v_mov_b32_e32 v5, v8
-; GFX1150-NEXT: .LBB12_31: ; %frem.loop_exit86
+; GFX1150-NEXT: .LBB12_31: ; %frem.loop_exit
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_add_nc_u32_e32 v7, -11, v7
; GFX1150-NEXT: v_ldexp_f32 v5, v5, v7
@@ -15597,7 +15597,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT: s_cmp_ngt_f32 s5, s12
; GFX1200-NEXT: s_cbranch_scc0 .LBB12_2
-; GFX1200-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-NEXT: ; %bb.1: ; %frem.else78
; GFX1200-NEXT: s_cmp_eq_f32 s5, s12
; GFX1200-NEXT: v_bfi_b32 v0, 0x7fffffff, 0, s8
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15607,7 +15607,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB12_8
; GFX1200-NEXT: .LBB12_2:
; GFX1200-NEXT: ; implicit-def: $vgpr0
-; GFX1200-NEXT: .LBB12_3: ; %frem.compute
+; GFX1200-NEXT: .LBB12_3: ; %frem.compute77
; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s6|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v0, |s8|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v3, s8
@@ -15643,11 +15643,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v4
; GFX1200-NEXT: v_div_fixup_f32 v3, v3, v1, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB12_7
-; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body85.preheader
; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12
; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT: s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT: .LBB12_5: ; %frem.loop_body
+; GFX1200-NEXT: .LBB12_5: ; %frem.loop_body85
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1200-NEXT: v_mov_b32_e32 v5, v2
@@ -15670,7 +15670,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.6: ; %Flow125
; GFX1200-NEXT: v_mov_b32_e32 v4, s11
; GFX1200-NEXT: v_mov_b32_e32 v2, v5
-; GFX1200-NEXT: .LBB12_7: ; %frem.loop_exit
+; GFX1200-NEXT: .LBB12_7: ; %frem.loop_exit86
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v4, -11, v4
; GFX1200-NEXT: v_ldexp_f32 v2, v2, v4
@@ -15694,7 +15694,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_cmp_ngt_f32 s8, s12
; GFX1200-NEXT: s_cbranch_scc0 .LBB12_10
-; GFX1200-NEXT: ; %bb.9: ; %frem.else16
+; GFX1200-NEXT: ; %bb.9: ; %frem.else47
; GFX1200-NEXT: s_cmp_eq_f32 s8, s12
; GFX1200-NEXT: v_bfi_b32 v1, 0x7fffffff, 0, s10
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15705,7 +15705,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB12_16
; GFX1200-NEXT: .LBB12_10:
; GFX1200-NEXT: ; implicit-def: $vgpr1
-; GFX1200-NEXT: .LBB12_11: ; %frem.compute15
+; GFX1200-NEXT: .LBB12_11: ; %frem.compute46
; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s4|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v1, |s10|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v4, s10
@@ -15741,11 +15741,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v5
; GFX1200-NEXT: v_div_fixup_f32 v4, v4, v2, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB12_15
-; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body54.preheader
; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT: .LBB12_13: ; %frem.loop_body23
+; GFX1200-NEXT: .LBB12_13: ; %frem.loop_body54
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_mov_b32_e32 v6, v3
@@ -15769,7 +15769,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.14: ; %Flow121
; GFX1200-NEXT: v_mov_b32_e32 v5, s11
; GFX1200-NEXT: v_mov_b32_e32 v3, v6
-; GFX1200-NEXT: .LBB12_15: ; %frem.loop_exit24
+; GFX1200-NEXT: .LBB12_15: ; %frem.loop_exit55
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v5, -11, v5
; GFX1200-NEXT: v_ldexp_f32 v3, v3, v5
@@ -15793,7 +15793,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_cmp_ngt_f32 s10, s12
; GFX1200-NEXT: s_cbranch_scc0 .LBB12_18
-; GFX1200-NEXT: ; %bb.17: ; %frem.else47
+; GFX1200-NEXT: ; %bb.17: ; %frem.else16
; GFX1200-NEXT: s_cmp_eq_f32 s10, s12
; GFX1200-NEXT: v_bfi_b32 v2, 0x7fffffff, 0, s9
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15804,7 +15804,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB12_24
; GFX1200-NEXT: .LBB12_18:
; GFX1200-NEXT: ; implicit-def: $vgpr2
-; GFX1200-NEXT: .LBB12_19: ; %frem.compute46
+; GFX1200-NEXT: .LBB12_19: ; %frem.compute15
; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s3|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v2, |s9|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v5, s9
@@ -15840,11 +15840,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v6
; GFX1200-NEXT: v_div_fixup_f32 v5, v5, v3, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB12_23
-; GFX1200-NEXT: ; %bb.20: ; %frem.loop_body54.preheader
+; GFX1200-NEXT: ; %bb.20: ; %frem.loop_body23.preheader
; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT: .LBB12_21: ; %frem.loop_body54
+; GFX1200-NEXT: .LBB12_21: ; %frem.loop_body23
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_mov_b32_e32 v7, v4
@@ -15868,7 +15868,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.22: ; %Flow117
; GFX1200-NEXT: v_mov_b32_e32 v6, s11
; GFX1200-NEXT: v_mov_b32_e32 v4, v7
-; GFX1200-NEXT: .LBB12_23: ; %frem.loop_exit55
+; GFX1200-NEXT: .LBB12_23: ; %frem.loop_exit24
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v6, -11, v6
; GFX1200-NEXT: v_ldexp_f32 v4, v4, v6
@@ -15892,7 +15892,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_cmp_ngt_f32 s9, s12
; GFX1200-NEXT: s_cbranch_scc0 .LBB12_26
-; GFX1200-NEXT: ; %bb.25: ; %frem.else78
+; GFX1200-NEXT: ; %bb.25: ; %frem.else
; GFX1200-NEXT: s_cmp_eq_f32 s9, s12
; GFX1200-NEXT: v_bfi_b32 v3, 0x7fffffff, 0, s7
; GFX1200-NEXT: s_cselect_b32 vcc_lo, -1, 0
@@ -15903,7 +15903,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB12_32
; GFX1200-NEXT: .LBB12_26:
; GFX1200-NEXT: ; implicit-def: $vgpr3
-; GFX1200-NEXT: .LBB12_27: ; %frem.compute77
+; GFX1200-NEXT: .LBB12_27: ; %frem.compute
; GFX1200-NEXT: v_frexp_mant_f32_e64 v4, |s2|
; GFX1200-NEXT: v_frexp_mant_f32_e64 v3, |s7|
; GFX1200-NEXT: v_frexp_exp_i32_f32_e32 v6, s7
@@ -15939,11 +15939,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 13, v7
; GFX1200-NEXT: v_div_fixup_f32 v6, v6, v4, 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB12_31
-; GFX1200-NEXT: ; %bb.28: ; %frem.loop_body85.preheader
+; GFX1200-NEXT: ; %bb.28: ; %frem.loop_body.preheader
; GFX1200-NEXT: s_sub_co_i32 s11, s11, s12
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT: .LBB12_29: ; %frem.loop_body85
+; GFX1200-NEXT: .LBB12_29: ; %frem.loop_body
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1200-NEXT: v_mov_b32_e32 v8, v5
@@ -15967,7 +15967,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.30: ; %Flow
; GFX1200-NEXT: v_mov_b32_e32 v7, s11
; GFX1200-NEXT: v_mov_b32_e32 v5, v8
-; GFX1200-NEXT: .LBB12_31: ; %frem.loop_exit86
+; GFX1200-NEXT: .LBB12_31: ; %frem.loop_exit
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_add_nc_u32_e32 v7, -11, v7
; GFX1200-NEXT: v_ldexp_f32 v5, v5, v7
@@ -16048,7 +16048,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]|
; SI-NEXT: s_and_b64 vcc, exec, s[0:1]
; SI-NEXT: s_cbranch_vccz .LBB13_2
-; SI-NEXT: ; %bb.1: ; %frem.else
+; SI-NEXT: ; %bb.1: ; %frem.else16
; SI-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
; SI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc
@@ -16059,7 +16059,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB13_2:
; SI-NEXT: ; implicit-def: $vgpr8_vgpr9
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB13_3: ; %frem.compute
+; SI-NEXT: .LBB13_3: ; %frem.compute15
; SI-NEXT: s_brev_b32 s5, -2
; SI-NEXT: v_and_b32_e32 v10, 0x7fffffff, v1
; SI-NEXT: s_mov_b32 s0, 0
@@ -16105,13 +16105,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; SI-NEXT: s_cmp_lt_i32 s6, 27
; SI-NEXT: s_cbranch_scc1 .LBB13_7
-; SI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; SI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; SI-NEXT: s_sub_i32 s0, s3, s7
; SI-NEXT: s_add_i32 s6, s0, 26
; SI-NEXT: s_mov_b32 s3, 0x432fffff
; SI-NEXT: v_mov_b32_e32 v18, 0x43300000
; SI-NEXT: v_mov_b32_e32 v14, 0
-; SI-NEXT: .LBB13_5: ; %frem.loop_body
+; SI-NEXT: .LBB13_5: ; %frem.loop_body23
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v17, v11
; SI-NEXT: v_mov_b32_e32 v16, v10
@@ -16134,7 +16134,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: ; %bb.6: ; %Flow51
; SI-NEXT: v_mov_b32_e32 v10, v16
; SI-NEXT: v_mov_b32_e32 v11, v17
-; SI-NEXT: .LBB13_7: ; %frem.loop_exit
+; SI-NEXT: .LBB13_7: ; %frem.loop_exit24
; SI-NEXT: s_sub_i32 s0, s6, 25
; SI-NEXT: v_ldexp_f64 v[10:11], v[10:11], s0
; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13]
@@ -16160,7 +16160,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_cmp_ngt_f64_e64 s[0:1], |v[2:3]|, |v[6:7]|
; SI-NEXT: s_and_b64 vcc, exec, s[0:1]
; SI-NEXT: s_cbranch_vccz .LBB13_10
-; SI-NEXT: ; %bb.9: ; %frem.else16
+; SI-NEXT: ; %bb.9: ; %frem.else
; SI-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; SI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
; SI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc
@@ -16171,7 +16171,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: .LBB13_10:
; SI-NEXT: ; implicit-def: $vgpr10_vgpr11
; SI-NEXT: s_mov_b64 vcc, 0
-; SI-NEXT: .LBB13_11: ; %frem.compute15
+; SI-NEXT: .LBB13_11: ; %frem.compute
; SI-NEXT: s_brev_b32 s5, -2
; SI-NEXT: v_and_b32_e32 v12, 0x7fffffff, v3
; SI-NEXT: s_mov_b32 s0, 0
@@ -16217,13 +16217,13 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; SI-NEXT: s_cmp_lt_i32 s6, 27
; SI-NEXT: s_cbranch_scc1 .LBB13_15
-; SI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; SI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; SI-NEXT: s_sub_i32 s0, s3, s7
; SI-NEXT: s_add_i32 s6, s0, 26
; SI-NEXT: s_mov_b32 s3, 0x432fffff
; SI-NEXT: v_mov_b32_e32 v20, 0x43300000
; SI-NEXT: v_mov_b32_e32 v16, 0
-; SI-NEXT: .LBB13_13: ; %frem.loop_body23
+; SI-NEXT: .LBB13_13: ; %frem.loop_body
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: v_mov_b32_e32 v19, v13
; SI-NEXT: v_mov_b32_e32 v18, v12
@@ -16246,7 +16246,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: ; %bb.14: ; %Flow
; SI-NEXT: v_mov_b32_e32 v12, v18
; SI-NEXT: v_mov_b32_e32 v13, v19
-; SI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; SI-NEXT: .LBB13_15: ; %frem.loop_exit
; SI-NEXT: s_sub_i32 s0, s6, 25
; SI-NEXT: v_ldexp_f64 v[12:13], v[12:13], s0
; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15]
@@ -16304,7 +16304,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB13_2
-; CI-NEXT: ; %bb.1: ; %frem.else
+; CI-NEXT: ; %bb.1: ; %frem.else16
; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
; CI-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; CI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc
@@ -16313,7 +16313,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB13_8
; CI-NEXT: .LBB13_2:
; CI-NEXT: ; implicit-def: $vgpr8_vgpr9
-; CI-NEXT: .LBB13_3: ; %frem.compute
+; CI-NEXT: .LBB13_3: ; %frem.compute15
; CI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; CI-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5]
; CI-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1]
@@ -16337,10 +16337,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17
; CI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; CI-NEXT: s_cbranch_vccnz .LBB13_7
-; CI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; CI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; CI-NEXT: v_sub_i32_e32 v14, vcc, v14, v15
; CI-NEXT: v_add_i32_e32 v17, vcc, 26, v14
-; CI-NEXT: .LBB13_5: ; %frem.loop_body
+; CI-NEXT: .LBB13_5: ; %frem.loop_body23
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v15, v11
; CI-NEXT: v_mov_b32_e32 v14, v10
@@ -16358,7 +16358,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; %bb.6: ; %Flow51
; CI-NEXT: v_mov_b32_e32 v10, v14
; CI-NEXT: v_mov_b32_e32 v11, v15
-; CI-NEXT: .LBB13_7: ; %frem.loop_exit
+; CI-NEXT: .LBB13_7: ; %frem.loop_exit24
; CI-NEXT: v_subrev_i32_e32 v14, vcc, 25, v17
; CI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
; CI-NEXT: s_brev_b32 s2, -2
@@ -16375,7 +16375,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]|
; CI-NEXT: s_and_b64 vcc, exec, s[2:3]
; CI-NEXT: s_cbranch_vccz .LBB13_10
-; CI-NEXT: ; %bb.9: ; %frem.else16
+; CI-NEXT: ; %bb.9: ; %frem.else
; CI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
; CI-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; CI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc
@@ -16384,7 +16384,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: s_branch .LBB13_16
; CI-NEXT: .LBB13_10:
; CI-NEXT: ; implicit-def: $vgpr10_vgpr11
-; CI-NEXT: .LBB13_11: ; %frem.compute15
+; CI-NEXT: .LBB13_11: ; %frem.compute
; CI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; CI-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7]
; CI-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3]
@@ -16408,10 +16408,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19
; CI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; CI-NEXT: s_cbranch_vccnz .LBB13_15
-; CI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; CI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; CI-NEXT: v_sub_i32_e32 v16, vcc, v16, v17
; CI-NEXT: v_add_i32_e32 v19, vcc, 26, v16
-; CI-NEXT: .LBB13_13: ; %frem.loop_body23
+; CI-NEXT: .LBB13_13: ; %frem.loop_body
; CI-NEXT: ; =>This Inner Loop Header: Depth=1
; CI-NEXT: v_mov_b32_e32 v17, v13
; CI-NEXT: v_mov_b32_e32 v16, v12
@@ -16429,7 +16429,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: ; %bb.14: ; %Flow
; CI-NEXT: v_mov_b32_e32 v12, v16
; CI-NEXT: v_mov_b32_e32 v13, v17
-; CI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; CI-NEXT: .LBB13_15: ; %frem.loop_exit
; CI-NEXT: v_subrev_i32_e32 v16, vcc, 25, v19
; CI-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
; CI-NEXT: s_brev_b32 s2, -2
@@ -16478,7 +16478,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB13_2
-; VI-NEXT: ; %bb.1: ; %frem.else
+; VI-NEXT: ; %bb.1: ; %frem.else16
; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
; VI-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; VI-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc
@@ -16487,7 +16487,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB13_8
; VI-NEXT: .LBB13_2:
; VI-NEXT: ; implicit-def: $vgpr8_vgpr9
-; VI-NEXT: .LBB13_3: ; %frem.compute
+; VI-NEXT: .LBB13_3: ; %frem.compute15
; VI-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; VI-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5]
; VI-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1]
@@ -16511,10 +16511,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17
; VI-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; VI-NEXT: s_cbranch_vccnz .LBB13_7
-; VI-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; VI-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; VI-NEXT: v_sub_u32_e32 v14, vcc, v14, v15
; VI-NEXT: v_add_u32_e32 v17, vcc, 26, v14
-; VI-NEXT: .LBB13_5: ; %frem.loop_body
+; VI-NEXT: .LBB13_5: ; %frem.loop_body23
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v15, v11
; VI-NEXT: v_mov_b32_e32 v14, v10
@@ -16532,7 +16532,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; %bb.6: ; %Flow51
; VI-NEXT: v_mov_b32_e32 v10, v14
; VI-NEXT: v_mov_b32_e32 v11, v15
-; VI-NEXT: .LBB13_7: ; %frem.loop_exit
+; VI-NEXT: .LBB13_7: ; %frem.loop_exit24
; VI-NEXT: v_subrev_u32_e32 v14, vcc, 25, v17
; VI-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
; VI-NEXT: s_brev_b32 s2, -2
@@ -16549,7 +16549,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]|
; VI-NEXT: s_and_b64 vcc, exec, s[2:3]
; VI-NEXT: s_cbranch_vccz .LBB13_10
-; VI-NEXT: ; %bb.9: ; %frem.else16
+; VI-NEXT: ; %bb.9: ; %frem.else
; VI-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
; VI-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; VI-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc
@@ -16558,7 +16558,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_branch .LBB13_16
; VI-NEXT: .LBB13_10:
; VI-NEXT: ; implicit-def: $vgpr10_vgpr11
-; VI-NEXT: .LBB13_11: ; %frem.compute15
+; VI-NEXT: .LBB13_11: ; %frem.compute
; VI-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; VI-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7]
; VI-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3]
@@ -16582,10 +16582,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19
; VI-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; VI-NEXT: s_cbranch_vccnz .LBB13_15
-; VI-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; VI-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; VI-NEXT: v_sub_u32_e32 v16, vcc, v16, v17
; VI-NEXT: v_add_u32_e32 v19, vcc, 26, v16
-; VI-NEXT: .LBB13_13: ; %frem.loop_body23
+; VI-NEXT: .LBB13_13: ; %frem.loop_body
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: v_mov_b32_e32 v17, v13
; VI-NEXT: v_mov_b32_e32 v16, v12
@@ -16603,7 +16603,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: ; %bb.14: ; %Flow
; VI-NEXT: v_mov_b32_e32 v12, v16
; VI-NEXT: v_mov_b32_e32 v13, v17
-; VI-NEXT: .LBB13_15: ; %frem.loop_exit24
+; VI-NEXT: .LBB13_15: ; %frem.loop_exit
; VI-NEXT: v_subrev_u32_e32 v16, vcc, 25, v19
; VI-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
; VI-NEXT: s_brev_b32 s2, -2
@@ -16647,7 +16647,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB13_2
-; GFX9-NEXT: ; %bb.1: ; %frem.else
+; GFX9-NEXT: ; %bb.1: ; %frem.else16
; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[0:1]|, |v[4:5]|
; GFX9-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; GFX9-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc
@@ -16656,7 +16656,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB13_8
; GFX9-NEXT: .LBB13_2:
; GFX9-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX9-NEXT: .LBB13_3: ; %frem.compute
+; GFX9-NEXT: .LBB13_3: ; %frem.compute15
; GFX9-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v15, v[4:5]
; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v14, v[0:1]
@@ -16680,10 +16680,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v17
; GFX9-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB13_7
-; GFX9-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX9-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX9-NEXT: v_sub_u32_e32 v14, v14, v15
; GFX9-NEXT: v_add_u32_e32 v17, 26, v14
-; GFX9-NEXT: .LBB13_5: ; %frem.loop_body
+; GFX9-NEXT: .LBB13_5: ; %frem.loop_body23
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v15, v11
; GFX9-NEXT: v_mov_b32_e32 v14, v10
@@ -16701,7 +16701,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: ; %bb.6: ; %Flow51
; GFX9-NEXT: v_mov_b32_e32 v10, v14
; GFX9-NEXT: v_mov_b32_e32 v11, v15
-; GFX9-NEXT: .LBB13_7: ; %frem.loop_exit
+; GFX9-NEXT: .LBB13_7: ; %frem.loop_exit24
; GFX9-NEXT: v_subrev_u32_e32 v14, 25, v17
; GFX9-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
; GFX9-NEXT: s_brev_b32 s2, -2
@@ -16718,7 +16718,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_ngt_f64_e64 s[2:3], |v[2:3]|, |v[6:7]|
; GFX9-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX9-NEXT: s_cbranch_vccz .LBB13_10
-; GFX9-NEXT: ; %bb.9: ; %frem.else16
+; GFX9-NEXT: ; %bb.9: ; %frem.else
; GFX9-NEXT: v_cmp_eq_f64_e64 vcc, |v[2:3]|, |v[6:7]|
; GFX9-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc
@@ -16727,7 +16727,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_branch .LBB13_16
; GFX9-NEXT: .LBB13_10:
; GFX9-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX9-NEXT: .LBB13_11: ; %frem.compute15
+; GFX9-NEXT: .LBB13_11: ; %frem.compute
; GFX9-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v17, v[6:7]
; GFX9-NEXT: v_frexp_exp_i32_f64_e32 v16, v[2:3]
@@ -16751,10 +16751,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 27, v19
; GFX9-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; GFX9-NEXT: s_cbranch_vccnz .LBB13_15
-; GFX9-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX9-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX9-NEXT: v_sub_u32_e32 v16, v16, v17
; GFX9-NEXT: v_add_u32_e32 v19, 26, v16
-; GFX9-NEXT: .LBB13_13: ; %frem.loop_body23
+; GFX9-NEXT: .LBB13_13: ; %frem.loop_body
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_mov_b32_e32 v17, v13
; GFX9-NEXT: v_mov_b32_e32 v16, v12
@@ -16772,7 +16772,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: ; %bb.14: ; %Flow
; GFX9-NEXT: v_mov_b32_e32 v12, v16
; GFX9-NEXT: v_mov_b32_e32 v13, v17
-; GFX9-NEXT: .LBB13_15: ; %frem.loop_exit24
+; GFX9-NEXT: .LBB13_15: ; %frem.loop_exit
; GFX9-NEXT: v_subrev_u32_e32 v16, 25, v19
; GFX9-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
; GFX9-NEXT: s_brev_b32 s2, -2
@@ -16817,7 +16817,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB13_2
-; GFX10-NEXT: ; %bb.1: ; %frem.else
+; GFX10-NEXT: ; %bb.1: ; %frem.else16
; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]|
; GFX10-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v9, v1, v8, vcc_lo
@@ -16826,7 +16826,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB13_8
; GFX10-NEXT: .LBB13_2:
; GFX10-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX10-NEXT: .LBB13_3: ; %frem.compute
+; GFX10-NEXT: .LBB13_3: ; %frem.compute15
; GFX10-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5]
; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1]
@@ -16851,10 +16851,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17
; GFX10-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB13_7
-; GFX10-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX10-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 26
-; GFX10-NEXT: .LBB13_5: ; %frem.loop_body
+; GFX10-NEXT: .LBB13_5: ; %frem.loop_body23
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v15, v11
; GFX10-NEXT: v_mov_b32_e32 v14, v10
@@ -16873,7 +16873,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_mov_b32_e32 v10, v14
; GFX10-NEXT: v_mov_b32_e32 v17, s2
; GFX10-NEXT: v_mov_b32_e32 v11, v15
-; GFX10-NEXT: .LBB13_7: ; %frem.loop_exit
+; GFX10-NEXT: .LBB13_7: ; %frem.loop_exit24
; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 25, v17
; GFX10-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
; GFX10-NEXT: v_mul_f64 v[12:13], v[10:11], v[12:13]
@@ -16889,7 +16889,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]|
; GFX10-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX10-NEXT: s_cbranch_vccz .LBB13_10
-; GFX10-NEXT: ; %bb.9: ; %frem.else16
+; GFX10-NEXT: ; %bb.9: ; %frem.else
; GFX10-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]|
; GFX10-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; GFX10-NEXT: v_cndmask_b32_e32 v11, v3, v10, vcc_lo
@@ -16898,7 +16898,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_branch .LBB13_16
; GFX10-NEXT: .LBB13_10:
; GFX10-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX10-NEXT: .LBB13_11: ; %frem.compute15
+; GFX10-NEXT: .LBB13_11: ; %frem.compute
; GFX10-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7]
; GFX10-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3]
@@ -16923,10 +16923,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19
; GFX10-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; GFX10-NEXT: s_cbranch_vccnz .LBB13_15
-; GFX10-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX10-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX10-NEXT: s_sub_i32 s2, s2, s3
; GFX10-NEXT: s_add_i32 s2, s2, 26
-; GFX10-NEXT: .LBB13_13: ; %frem.loop_body23
+; GFX10-NEXT: .LBB13_13: ; %frem.loop_body
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_mov_b32_e32 v17, v13
; GFX10-NEXT: v_mov_b32_e32 v16, v12
@@ -16945,7 +16945,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: v_mov_b32_e32 v12, v16
; GFX10-NEXT: v_mov_b32_e32 v19, s2
; GFX10-NEXT: v_mov_b32_e32 v13, v17
-; GFX10-NEXT: .LBB13_15: ; %frem.loop_exit24
+; GFX10-NEXT: .LBB13_15: ; %frem.loop_exit
; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 25, v19
; GFX10-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
; GFX10-NEXT: v_mul_f64 v[14:15], v[12:13], v[14:15]
@@ -16986,7 +16986,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB13_2
-; GFX11-NEXT: ; %bb.1: ; %frem.else
+; GFX11-NEXT: ; %bb.1: ; %frem.else16
; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]|
; GFX11-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -16996,7 +16996,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB13_8
; GFX11-NEXT: .LBB13_2:
; GFX11-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX11-NEXT: .LBB13_3: ; %frem.compute
+; GFX11-NEXT: .LBB13_3: ; %frem.compute15
; GFX11-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5]
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1]
@@ -17029,12 +17029,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB13_7
-; GFX11-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX11-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 26
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB13_5: ; %frem.loop_body
+; GFX11-NEXT: .LBB13_5: ; %frem.loop_body23
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
@@ -17054,7 +17054,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.6: ; %Flow51
; GFX11-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14
; GFX11-NEXT: v_mov_b32_e32 v11, v15
-; GFX11-NEXT: .LBB13_7: ; %frem.loop_exit
+; GFX11-NEXT: .LBB13_7: ; %frem.loop_exit24
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_subrev_nc_u32_e32 v14, 25, v17
; GFX11-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
@@ -17074,7 +17074,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]|
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX11-NEXT: s_cbranch_vccz .LBB13_10
-; GFX11-NEXT: ; %bb.9: ; %frem.else16
+; GFX11-NEXT: ; %bb.9: ; %frem.else
; GFX11-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]|
; GFX11-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -17084,7 +17084,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_branch .LBB13_16
; GFX11-NEXT: .LBB13_10:
; GFX11-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX11-NEXT: .LBB13_11: ; %frem.compute15
+; GFX11-NEXT: .LBB13_11: ; %frem.compute
; GFX11-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7]
; GFX11-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3]
@@ -17117,12 +17117,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; GFX11-NEXT: s_cbranch_vccnz .LBB13_15
-; GFX11-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX11-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX11-NEXT: s_sub_i32 s2, s2, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, 26
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB13_13: ; %frem.loop_body23
+; GFX11-NEXT: .LBB13_13: ; %frem.loop_body
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12
@@ -17142,7 +17142,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: ; %bb.14: ; %Flow
; GFX11-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16
; GFX11-NEXT: v_mov_b32_e32 v13, v17
-; GFX11-NEXT: .LBB13_15: ; %frem.loop_exit24
+; GFX11-NEXT: .LBB13_15: ; %frem.loop_exit
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 25, v19
; GFX11-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
@@ -17187,7 +17187,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]|
; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX1150-NEXT: s_cbranch_vccz .LBB13_2
-; GFX1150-NEXT: ; %bb.1: ; %frem.else
+; GFX1150-NEXT: ; %bb.1: ; %frem.else16
; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]|
; GFX1150-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -17197,7 +17197,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB13_8
; GFX1150-NEXT: .LBB13_2:
; GFX1150-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX1150-NEXT: .LBB13_3: ; %frem.compute
+; GFX1150-NEXT: .LBB13_3: ; %frem.compute15
; GFX1150-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5]
; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1]
@@ -17229,12 +17229,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17
; GFX1150-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB13_7
-; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1150-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX1150-NEXT: s_sub_i32 s2, s2, s3
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s2, s2, 26
; GFX1150-NEXT: .p2align 6
-; GFX1150-NEXT: .LBB13_5: ; %frem.loop_body
+; GFX1150-NEXT: .LBB13_5: ; %frem.loop_body23
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
@@ -17254,7 +17254,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.6: ; %Flow51
; GFX1150-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14
; GFX1150-NEXT: v_mov_b32_e32 v11, v15
-; GFX1150-NEXT: .LBB13_7: ; %frem.loop_exit
+; GFX1150-NEXT: .LBB13_7: ; %frem.loop_exit24
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_subrev_nc_u32_e32 v14, 25, v17
; GFX1150-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
@@ -17274,7 +17274,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_ngt_f64_e64 s2, |v[2:3]|, |v[6:7]|
; GFX1150-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX1150-NEXT: s_cbranch_vccz .LBB13_10
-; GFX1150-NEXT: ; %bb.9: ; %frem.else16
+; GFX1150-NEXT: ; %bb.9: ; %frem.else
; GFX1150-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]|
; GFX1150-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -17284,7 +17284,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: s_branch .LBB13_16
; GFX1150-NEXT: .LBB13_10:
; GFX1150-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1150-NEXT: .LBB13_11: ; %frem.compute15
+; GFX1150-NEXT: .LBB13_11: ; %frem.compute
; GFX1150-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7]
; GFX1150-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3]
@@ -17316,12 +17316,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19
; GFX1150-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; GFX1150-NEXT: s_cbranch_vccnz .LBB13_15
-; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1150-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1150-NEXT: s_sub_i32 s2, s2, s3
; GFX1150-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1150-NEXT: s_add_i32 s2, s2, 26
; GFX1150-NEXT: .p2align 6
-; GFX1150-NEXT: .LBB13_13: ; %frem.loop_body23
+; GFX1150-NEXT: .LBB13_13: ; %frem.loop_body
; GFX1150-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1150-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12
@@ -17341,7 +17341,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: ; %bb.14: ; %Flow
; GFX1150-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16
; GFX1150-NEXT: v_mov_b32_e32 v13, v17
-; GFX1150-NEXT: .LBB13_15: ; %frem.loop_exit24
+; GFX1150-NEXT: .LBB13_15: ; %frem.loop_exit
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_subrev_nc_u32_e32 v16, 25, v19
; GFX1150-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
@@ -17386,7 +17386,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_ngt_f64_e64 s2, |v[0:1]|, |v[4:5]|
; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX1200-NEXT: s_cbranch_vccz .LBB13_2
-; GFX1200-NEXT: ; %bb.1: ; %frem.else
+; GFX1200-NEXT: ; %bb.1: ; %frem.else16
; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[0:1]|, |v[4:5]|
; GFX1200-NEXT: v_and_b32_e32 v8, 0x80000000, v1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -17396,7 +17396,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB13_8
; GFX1200-NEXT: .LBB13_2:
; GFX1200-NEXT: ; implicit-def: $vgpr8_vgpr9
-; GFX1200-NEXT: .LBB13_3: ; %frem.compute
+; GFX1200-NEXT: .LBB13_3: ; %frem.compute15
; GFX1200-NEXT: v_frexp_mant_f64_e64 v[8:9], |v[0:1]|
; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v13, v[4:5]
; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v12, v[0:1]
@@ -17429,11 +17429,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v17
; GFX1200-NEXT: v_div_fixup_f64 v[12:13], v[12:13], v[8:9], 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB13_7
-; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body.preheader
+; GFX1200-NEXT: ; %bb.4: ; %frem.loop_body23.preheader
; GFX1200-NEXT: s_sub_co_i32 s2, s2, s3
; GFX1200-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT: s_add_co_i32 s2, s2, 26
-; GFX1200-NEXT: .LBB13_5: ; %frem.loop_body
+; GFX1200-NEXT: .LBB13_5: ; %frem.loop_body23
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX1200-NEXT: v_dual_mov_b32 v15, v11 :: v_dual_mov_b32 v14, v10
@@ -17454,7 +17454,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.6: ; %Flow51
; GFX1200-NEXT: v_dual_mov_b32 v17, s2 :: v_dual_mov_b32 v10, v14
; GFX1200-NEXT: v_mov_b32_e32 v11, v15
-; GFX1200-NEXT: .LBB13_7: ; %frem.loop_exit
+; GFX1200-NEXT: .LBB13_7: ; %frem.loop_exit24
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_subrev_nc_u32_e32 v14, 25, v17
; GFX1200-NEXT: v_ldexp_f64 v[10:11], v[10:11], v14
@@ -17476,7 +17476,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s2
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_cbranch_vccz .LBB13_10
-; GFX1200-NEXT: ; %bb.9: ; %frem.else16
+; GFX1200-NEXT: ; %bb.9: ; %frem.else
; GFX1200-NEXT: v_cmp_eq_f64_e64 vcc_lo, |v[2:3]|, |v[6:7]|
; GFX1200-NEXT: v_and_b32_e32 v10, 0x80000000, v3
; GFX1200-NEXT: s_wait_alu 0xfffd
@@ -17487,7 +17487,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: s_branch .LBB13_16
; GFX1200-NEXT: .LBB13_10:
; GFX1200-NEXT: ; implicit-def: $vgpr10_vgpr11
-; GFX1200-NEXT: .LBB13_11: ; %frem.compute15
+; GFX1200-NEXT: .LBB13_11: ; %frem.compute
; GFX1200-NEXT: v_frexp_mant_f64_e64 v[10:11], |v[2:3]|
; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v15, v[6:7]
; GFX1200-NEXT: v_frexp_exp_i32_f64_e32 v14, v[2:3]
@@ -17520,11 +17520,11 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: v_cmp_gt_i32_e32 vcc_lo, 27, v19
; GFX1200-NEXT: v_div_fixup_f64 v[14:15], v[14:15], v[10:11], 1.0
; GFX1200-NEXT: s_cbranch_vccnz .LBB13_15
-; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body23.preheader
+; GFX1200-NEXT: ; %bb.12: ; %frem.loop_body.preheader
; GFX1200-NEXT: s_sub_co_i32 s2, s2, s3
; GFX1200-NEXT: s_wait_alu 0xfffe
; GFX1200-NEXT: s_add_co_i32 s2, s2, 26
-; GFX1200-NEXT: .LBB13_13: ; %frem.loop_body23
+; GFX1200-NEXT: .LBB13_13: ; %frem.loop_body
; GFX1200-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX1200-NEXT: v_dual_mov_b32 v17, v13 :: v_dual_mov_b32 v16, v12
@@ -17547,7 +17547,7 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1200-NEXT: ; %bb.14: ; %Flow
; GFX1200-NEXT: v_dual_mov_b32 v19, s2 :: v_dual_mov_b32 v12, v16
; GFX1200-NEXT: v_mov_b32_e32 v13, v17
-; GFX1200-NEXT: .LBB13_15: ; %frem.loop_exit24
+; GFX1200-NEXT: .LBB13_15: ; %frem.loop_exit
; GFX1200-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1200-NEXT: v_subrev_nc_u32_e32 v16, 25, v19
; GFX1200-NEXT: v_ldexp_f64 v[12:13], v[12:13], v16
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 3c41cc4..5babe9f 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -1111,15 +1111,11 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1190,18 +1186,15 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 {
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 4
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: buffer_store_b8 v4, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v2
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
@@ -1281,28 +1274,22 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 {
; GFX11-TRUE16-LABEL: void_func_v8i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v5.h, v4.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v6.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v6.l, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.l, v1.h
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v6
-; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v6
-; GFX11-TRUE16-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0
+; GFX11-TRUE16-NEXT: buffer_store_b64 v[2:3], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: void_func_v8i8:
@@ -1416,44 +1403,34 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 {
; GFX11-TRUE16-LABEL: void_func_v16i8:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l
-; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v15.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.h, v12.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v11.l
; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v13.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v14.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v9.h
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v9.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v14.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v8.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.h, v4.h
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v14.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v10.h, v15.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.l, v13.l
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v6.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v0.h, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v2.l, v1.h
+; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v14
-; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v14.l
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v14
-; GFX11-TRUE16-NEXT: buffer_store_b128 v[5:8], off, s[0:3], 0
+; GFX11-TRUE16-NEXT: buffer_store_b128 v[8:11], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: void_func_v16i8:
@@ -1649,78 +1626,59 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 {
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v31, off, s32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v32.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v12.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v10.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v9.l
+; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l
; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v6.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v6.l, v7.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v0.h, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v6.h, v5.h
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v32
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v32.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v7.h, v6.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l
-; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v6.h, v5.h
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v8.l, v7.h
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.l, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v11.l, v10.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v12.l, v11.h
+; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.l, v12.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.l, v13.h
+; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v14.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v16.h, v15.h
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v16.l, v17.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v9.h
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16
-; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.h, v5.h
; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v31.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v7.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.l, v4.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v19.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v17.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v14, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v8.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v5.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v32
-; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v32
+; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v10.l, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v8.h
; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0
; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0
; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index f67ab18..234eaa8 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -4985,21 +4985,17 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
-; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1
@@ -5243,18 +5239,14 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 4
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 4
+; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_clause 0x1
-; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v4, off
-; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v2, off
+; GFX11-TRUE16-NEXT: global_store_b8 v[2:3], v4, off
+; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
@@ -5528,27 +5520,21 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4
; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4
-; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
-; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
-; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[1:2], off
+; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v2.l, v3.l
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l
+; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[3:4], off
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4
+; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1
+; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0
; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33
; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2
; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1
@@ -5994,73 +5980,53 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 {
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l
; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l
-; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v12.l
+; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h
; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v9.l
; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l
-; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l
-; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l
-; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v12.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v31.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v30.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v29.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v28.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v12
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v27.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v26.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v25.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v24.l
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v2, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v23.l
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v22.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v21.l
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v20.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v12
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v19.l
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v12.l
-; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v18.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l
-; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v4.l
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v12
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.h, v2.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.h, v4.h
+; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.l, v3.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v30.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v29.l
+; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v28.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v26.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l
+; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.h, v0.h
+; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v2.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v4.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v25.l
+; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v24.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v23.l
+; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v22.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v21.l
+; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v19.l
+; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v18.l
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v17.l
+; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v16.l
+; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v0.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v1.l
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.h, v4.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v6.l
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.h, v7.l
+; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v5.h, v5.l
; GFX11-TRUE16-NEXT: s_clause 0x1
; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[0:3], off
-; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[5:8], off
+; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[9:12], off
; GFX11-TRUE16-NEXT: s_clause 0x3
; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s33
; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 049663a..f80d50b 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -2730,18 +2730,15 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l
; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.h
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v2.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.h
+; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v0.h
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v1, v7, v6
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v6
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
diff --git a/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
new file mode 100644
index 0000000..a4aad57
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sched.group.classification.mir
@@ -0,0 +1,59 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=machine-scheduler -o - %s | FileCheck %s
+
+---
+name: buffer_load_lds_not_valu
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0_vgpr1
+ ; CHECK-LABEL: name: buffer_load_lds_not_valu
+ ; CHECK: liveins: $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $exec = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF3:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: [[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF2]], [[DEF3]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[DEF3]], [[V_ADD_U32_e32_]], implicit $exec
+ ; CHECK-NEXT: $m0 = S_MOV_B32 0
+ ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+ ; CHECK-NEXT: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_]], [[V_ADD_U32_e32_1]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_3:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_1]], [[V_ADD_U32_e32_2]], implicit $exec
+ ; CHECK-NEXT: $m0 = S_MOV_B32 1
+ ; CHECK-NEXT: BUFFER_LOAD_DWORDX4_LDS_OFFEN [[DEF]], [[DEF1]], 0, 0, 0, 0, implicit $exec, implicit $m0
+ ; CHECK-NEXT: [[V_ADD_U32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_2]], [[V_ADD_U32_e32_3]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_5:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_3]], [[V_ADD_U32_e32_4]], implicit $exec
+ ; CHECK-NEXT: [[V_ADD_U32_e32_6:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_4]], [[V_ADD_U32_e32_5]], implicit $exec
+ ; CHECK-NEXT: dead [[V_ADD_U32_e32_7:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 [[V_ADD_U32_e32_5]], [[V_ADD_U32_e32_6]], implicit $exec
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 2, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 4, 1, 0
+ ; CHECK-NEXT: SCHED_GROUP_BARRIER 2, 4, 0
+ ; CHECK-NEXT: S_ENDPGM 0
+ $exec = IMPLICIT_DEF
+ %0:vgpr_32 = IMPLICIT_DEF
+ %1:sgpr_128 = IMPLICIT_DEF
+ %2:vgpr_32 = IMPLICIT_DEF
+ %3:vgpr_32 = IMPLICIT_DEF
+ %4:vgpr_32 = V_ADD_U32_e32 %2, %3, implicit $exec
+ %5:vgpr_32 = V_ADD_U32_e32 %3, %4, implicit $exec
+ $m0 = S_MOV_B32 0
+ BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+ $m0 = S_MOV_B32 1
+ BUFFER_LOAD_DWORDX4_LDS_OFFEN %0, %1, 0, 0, 0, 0, implicit $exec, implicit $m0
+ %6:vgpr_32 = V_ADD_U32_e32 %4, %5, implicit $exec
+ %7:vgpr_32 = V_ADD_U32_e32 %5, %6, implicit $exec
+ %8:vgpr_32 = V_ADD_U32_e32 %6, %7, implicit $exec
+ %9:vgpr_32 = V_ADD_U32_e32 %7, %8, implicit $exec
+ %10:vgpr_32 = V_ADD_U32_e32 %8, %9, implicit $exec
+ %11:vgpr_32 = V_ADD_U32_e32 %9, %10, implicit $exec
+ SCHED_GROUP_BARRIER 2, 2, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ SCHED_GROUP_BARRIER 2, 2, 0
+ SCHED_GROUP_BARRIER 4, 1 ,0
+ SCHED_GROUP_BARRIER 2, 4, 0
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
index 9553fcc..f11fe4a 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir
@@ -59,6 +59,15 @@ body: |
...
---
+name: src_shared_base_to_vcc
+body: |
+ bb.0:
+ ; GFX9-LABEL: name: src_shared_base_to_vcc
+ ; GFX9: $vcc = S_MOV_B64 $src_shared_base
+ $vcc = COPY $src_shared_base
+...
+
+---
name: sgpr96_aligned_src_dst
body: |
bb.0:
diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
index c8fee5d..7cbe5de 100644
--- a/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
+++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-legalize-bitcounts.mir
@@ -119,9 +119,10 @@ body: |
; CHECK: [[R32:%[0-9]+]]:_(s32) = G_SUB [[COUNT]], [[BITDIFF]]
%2(s16) = G_CTLZ %1
- ; CHECK: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]]
- ; CHECK: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
- ; CHECK: $r0 = COPY [[R]]
+ ; LIBCALLS: [[SHIFTEDR:%[0-9]+]]:_(s32) = G_SHL [[R32]], [[BITDIFF]]
+ ; LIBCALLS: [[R:%[0-9]+]]:_(s32) = G_ASHR [[SHIFTEDR]], [[BITDIFF]]
+ ; LIBCALLS: $r0 = COPY [[R]]
+ ; CLZ: $r0 = COPY [[R32]]
%3(s32) = G_SEXT %2(s16)
$r0 = COPY %3(s32)
BX_RET 14, $noreg, implicit $r0
diff --git a/llvm/test/CodeGen/ARM/carry.ll b/llvm/test/CodeGen/ARM/carry.ll
index 558e2b0..a652241 100644
--- a/llvm/test/CodeGen/ARM/carry.ll
+++ b/llvm/test/CodeGen/ARM/carry.ll
@@ -1,61 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=armv6t2-eabi %s -o - | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
; CHECK-LABEL: f1:
-; CHECK: subs r
-; CHECK: sbc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, r0, r2
+; CHECK-NEXT: sbc r1, r1, r3
+; CHECK-NEXT: bx lr
entry:
- %tmp = sub i64 %a, %b
- ret i64 %tmp
+ %tmp = sub i64 %a, %b
+ ret i64 %tmp
}
define i64 @f2(i64 %a, i64 %b) {
; CHECK-LABEL: f2:
-; CHECK: lsl r
-; CHECK: orr r
-; CHECK: rsbs r
-; CHECK: sbc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsl r1, r1, #1
+; CHECK-NEXT: orr r1, r1, r0, lsr #31
+; CHECK-NEXT: rsbs r0, r2, r0, lsl #1
+; CHECK-NEXT: sbc r1, r1, r3
+; CHECK-NEXT: bx lr
entry:
- %tmp1 = shl i64 %a, 1
- %tmp2 = sub i64 %tmp1, %b
- ret i64 %tmp2
+ %tmp1 = shl i64 %a, 1
+ %tmp2 = sub i64 %tmp1, %b
+ ret i64 %tmp2
}
; add with live carry
define i64 @f3(i32 %al, i32 %bl) {
; CHECK-LABEL: f3:
-; CHECK: adds r
-; CHECK: adc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, r0, r1
+; CHECK-NEXT: mov r2, #0
+; CHECK-NEXT: adcs r0, r1, #0
+; CHECK-NEXT: adc r1, r2, #0
+; CHECK-NEXT: bx lr
entry:
- ; unsigned wide add
- %aw = zext i32 %al to i64
- %bw = zext i32 %bl to i64
- %cw = add i64 %aw, %bw
- ; ch == carry bit
- %ch = lshr i64 %cw, 32
- %dw = add i64 %ch, %bw
- ret i64 %dw
+ ; unsigned wide add
+ %aw = zext i32 %al to i64
+ %bw = zext i32 %bl to i64
+ %cw = add i64 %aw, %bw
+ ; ch == carry bit
+ %ch = lshr i64 %cw, 32
+ %dw = add i64 %ch, %bw
+ ret i64 %dw
}
; rdar://10073745
define i64 @f4(i64 %x) nounwind readnone {
-entry:
; CHECK-LABEL: f4:
-; CHECK: rsbs r
-; CHECK: rsc r
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: rsbs r0, r0, #0
+; CHECK-NEXT: rsc r1, r1, #0
+; CHECK-NEXT: bx lr
+entry:
%0 = sub nsw i64 0, %x
ret i64 %0
}
; rdar://12559385
define i64 @f5(i32 %vi) {
-entry:
; CHECK-LABEL: f5:
-; CHECK: movw [[REG:r[0-9]+]], #36102
-; CHECK: sbc r{{[0-9]+}}, r{{[0-9]+}}, [[REG]]
- %v0 = zext i32 %vi to i64
- %v1 = xor i64 %v0, -155057456198619
- %v4 = add i64 %v1, 155057456198619
- %v5 = add i64 %v4, %v1
- ret i64 %v5
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movw r1, #19493
+; CHECK-NEXT: movw r2, #29433
+; CHECK-NEXT: movt r1, #57191
+; CHECK-NEXT: eor r0, r0, r1
+; CHECK-NEXT: movw r3, #46043
+; CHECK-NEXT: movt r2, #65535
+; CHECK-NEXT: adds r0, r0, r0
+; CHECK-NEXT: movw r1, #36102
+; CHECK-NEXT: sbc r2, r2, r1
+; CHECK-NEXT: movt r3, #8344
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adc r1, r2, r1
+; CHECK-NEXT: bx lr
+entry:
+ %v0 = zext i32 %vi to i64
+ %v1 = xor i64 %v0, -155057456198619
+ %v4 = add i64 %v1, 155057456198619
+ %v5 = add i64 %v4, %v1
+ ret i64 %v5
}
diff --git a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
index 1030917..302f70f 100644
--- a/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
+++ b/llvm/test/CodeGen/MIR/AArch64/return-address-signing.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=aarch64 -run-pass=prologepilog -run-pass=aarch64-ptrauth -o - %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=aarch64 -run-pass=prologepilog -run-pass=aarch64-ptrauth -o - %s 2>&1 | FileCheck --strict-whitespace %s
--- |
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64"
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
index 1edb387..f345e08 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
declare void @llvm.nvvm.tcgen05.alloc.cg1(ptr %addr, i32 %ncols)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
index 2e80c4c..29b130f 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK_PTX64 %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK_PTX64 %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
declare void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr)
declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
index 817b1d5..4e463a14 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
define void @test_tcgen05_cp_64x128_v1_cg1(ptr addrspace(6) %addr, i64 %sdesc) {
; CHECK-LABEL: test_tcgen05_cp_64x128_v1_cg1(
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
index cbf647f..fc8cce4 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-fence.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
declare void @llvm.nvvm.tcgen05.fence.before.thread.sync()
declare void @llvm.nvvm.tcgen05.fence.after.thread.sync()
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
index a37b1a9..22eb729 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_100a | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mattr=+ptx86 -mcpu=sm_101a | %ptxas-verify -arch=sm_101a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_103a | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mattr=+ptx88 -mcpu=sm_100f | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mattr=+ptx90 -mcpu=sm_110f | %ptxas-verify -arch=sm_110f %}
; CHECK-LABEL: nvvm_tcgen05_ld_16x64b
define void @nvvm_tcgen05_ld_16x64b(ptr addrspace(6) %taddr) {
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
index bf2adac..33483b5 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll
@@ -1,8 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
; RUN: llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | FileCheck --check-prefixes=CHECK %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx90 | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_110a && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110a -mattr=+ptx90 | %ptxas-verify -arch=sm_110a %}
declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr)
declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr)
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
index 0636a06..ccf6541 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll
@@ -2,9 +2,13 @@
; RUN: llc < %s -o - -mcpu=sm_100a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_101a -march=nvptx64 -mattr=+ptx86 | FileCheck %s
; RUN: llc < %s -o - -mcpu=sm_103a -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_100f -march=nvptx64 -mattr=+ptx88 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=sm_110f -march=nvptx64 -mattr=+ptx90 | FileCheck %s
; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
; RUN: %if ptxas-sm_101a && ptxas-isa-8.6 %{ llc < %s -march=nvptx64 -mcpu=sm_101a -mattr=+ptx86 | %ptxas-verify -arch=sm_101a %}
; RUN: %if ptxas-sm_103a && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_103a -mattr=+ptx88 | %ptxas-verify -arch=sm_103a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -march=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
; CHECK-LABEL: nvvm_tcgen05_st_16x64b
define void @nvvm_tcgen05_st_16x64b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32> %stv2, <4 x i32> %stv4, <8 x i32> %stv8, <16 x i32> %stv16, <32 x i32> %stv32, <64 x i32> %stv64, <128 x i32> %stv128) {
diff --git a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll
index 2a46a59..4f036d3 100644
--- a/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll
+++ b/llvm/test/CodeGen/RISCV/and-negpow2-cmp.ll
@@ -221,8 +221,8 @@ define i64 @test12(i64 %0) #0 {
;
; RV64-LABEL: test12:
; RV64: # %bb.0: # %entry
-; RV64-NEXT: addiw a0, a0, -16
-; RV64-NEXT: addi a0, a0, 13
+; RV64-NEXT: addi a0, a0, -16
+; RV64-NEXT: addiw a0, a0, 13
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/RISCV/i64-icmp.ll b/llvm/test/CodeGen/RISCV/i64-icmp.ll
index 88d989d..2742b9a 100644
--- a/llvm/test/CodeGen/RISCV/i64-icmp.ll
+++ b/llvm/test/CodeGen/RISCV/i64-icmp.ll
@@ -708,8 +708,7 @@ define i64 @icmp_sle_constant_neg_2050(i64 %a) nounwind {
define i64 @icmp_eq_zext_inreg_small_constant(i64 %a) nounwind {
; RV64I-LABEL: icmp_eq_zext_inreg_small_constant:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
-; RV64I-NEXT: addi a0, a0, -123
+; RV64I-NEXT: addiw a0, a0, -123
; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: ret
%1 = and i64 %a, 4294967295
@@ -748,8 +747,7 @@ define i64 @icmp_ne_zext_inreg_small_constant(i64 %a) nounwind {
define i64 @icmp_ne_zext_inreg_large_constant(i64 %a) nounwind {
; RV64I-LABEL: icmp_ne_zext_inreg_large_constant:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
-; RV64I-NEXT: addi a0, a0, 2
+; RV64I-NEXT: addiw a0, a0, 2
; RV64I-NEXT: snez a0, a0
; RV64I-NEXT: ret
%1 = and i64 %a, 4294967295
diff --git a/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir
new file mode 100644
index 0000000..389283a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sifive-xsfmm-vset-insert.mir
@@ -0,0 +1,523 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc %s -o - -mtriple=riscv64 -mattr=+v \
+# RUN: -run-pass=phi-node-elimination,register-coalescer,riscv-insert-vsetvli | FileCheck %s
+
+--- |
+ define void @xsfmm_same_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 noundef %tm, i64 noundef %tn, i64 noundef %tk) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ ret void
+ }
+
+ define void @xsfmm_different_state(<vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 4)
+ ret void
+ }
+
+ define void @xsfmm_different_state_bf(<vscale x 32 x half> %tile1, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64 2, <vscale x 32 x bfloat> %tile2, <vscale x 32 x bfloat> %tile2, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 2, <vscale x 32 x half> %tile1, <vscale x 32 x half> %tile1, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ ret void
+ }
+
+ define <vscale x 64 x i8> @interleave_rvv_and_xsfmm(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) {
+ entry:
+ %0 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl)
+ %1 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl)
+ call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl)
+ ret <vscale x 64 x i8> %1
+ }
+
+ define <vscale x 64 x i8> @interleave_rvv_and_xsfmm2(<vscale x 64 x i8> %tile, i64 %vl, ptr %base) {
+ entry:
+ %0 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %tile, i64 %vl)
+ %1 = call <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 1, i64 %vl)
+ %2 = call <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> %tile, <vscale x 64 x i8> %0, i64 %vl)
+ call void @llvm.riscv.sf.vste16.i64(i64 1, ptr %base, i64 %vl)
+ ret <vscale x 64 x i8> %2
+ }
+
+ define void @consecutive_xsfmm(<vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, ptr %base) {
+ entry:
+ tail call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 0, <vscale x 32 x half> %tile, <vscale x 32 x half> %tile, i64 %tm, i64 %tn, i64 %tk, i64 2)
+ call void @llvm.riscv.sf.vste16.i64(i64 0, ptr %base, i64 %tn)
+ ret void
+ }
+
+ define i64 @vsettnt_max(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2)
+ %1 = call i64 @llvm.riscv.sf.vsettnt_max.i64(i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define i64 @single_vsettm(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettm.i64(i64 %vl, i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define i64 @single_vsettn(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettn.i64(i64 %vl, i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define i64 @single_vsettk(i64 %vl) {
+ entry:
+ %0 = call i64 @llvm.riscv.sf.vsettk.i64(i64 %vl, i64 1, i64 2)
+ ret i64 %0
+ }
+
+ define void @sf_vtzero(i64 %tm, i64 %tn) {
+ entry:
+ call void @llvm.riscv.sf.vtzero.i64(i64 1, i64 %tm, i64 %tn, i64 3, i64 4)
+ ret void
+ }
+
+ declare void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64, <vscale x 32 x half>, <vscale x 32 x half>, i64, i64, i64, i64)
+ declare void @llvm.riscv.sf.mm.f.f.i64.nxv32bf16(i64, <vscale x 32 x bfloat>, <vscale x 32 x bfloat>, i64, i64, i64, i64)
+ declare <vscale x 64 x i8> @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64, i64)
+ declare <vscale x 64 x i8> @llvm.riscv.vadd.nxv64i8.nxv64i8.i64(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i8>, i64)
+ declare void @llvm.riscv.sf.vste16.i64(i64, ptr, i64)
+ declare i64 @llvm.riscv.sf.vsettnt_max.i64(i64, i64)
+ declare i64 @llvm.riscv.sf.vsettm.i64(i64, i64, i64)
+ declare i64 @llvm.riscv.sf.vsettn.i64(i64, i64, i64)
+ declare i64 @llvm.riscv.sf.vsettk.i64(i64, i64, i64)
+ declare void @llvm.riscv.sf.vtzero.i64(i64, i64, i64, i64, i64)
+...
+---
+name: xsfmm_same_state
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: vrm8 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$v16m8', virtual-reg: '%1' } # %1 is copied from $v16m8 in bb.0.entry
+ - { reg: '$x10', virtual-reg: '%2' }
+ - { reg: '$x11', virtual-reg: '%3' }
+ - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-LABEL: name: xsfmm_same_state
+ ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %4:gprnox0 = COPY $x12
+ %3:gprnox0 = COPY $x11
+ %2:gprnox0 = COPY $x10
+ %1:vrm8 = COPY $v16m8
+ %0:vrm8 = COPY $v8m8
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoRET
+...
+---
+name: xsfmm_different_state
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: vrm8 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$v16m8', virtual-reg: '%1' } # %1 is copied from $v16m8 in bb.0.entry
+ - { reg: '$x10', virtual-reg: '%2' }
+ - { reg: '$x11', virtual-reg: '%3' }
+ - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-LABEL: name: xsfmm_different_state
+ ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1544 /* e16, w4 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 3, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 3, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 4, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %4:gprnox0 = COPY $x12
+ %3:gprnox0 = COPY $x11
+ %2:gprnox0 = COPY $x10
+ %1:vrm8 = COPY $v16m8
+ %0:vrm8 = COPY $v8m8
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F $t2, %0:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 4, implicit $frm
+ PseudoRET
+...
+---
+name: xsfmm_different_state_bf
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: vrm8 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$v16m8', virtual-reg: '%1' } # %1 is copied from $v16m8 in bb.0.entry
+ - { reg: '$x10', virtual-reg: '%2' }
+ - { reg: '$x11', virtual-reg: '%3' }
+ - { reg: '$x12', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-LABEL: name: xsfmm_different_state_bf
+ ; CHECK: liveins: $v8m8, $v16m8, $x10, $x11, $x12
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vrm8 = COPY $v16m8
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1288 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F_ALT $t2, [[COPY3]], [[COPY3]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY2]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY4]], [[COPY4]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %4:gprnox0 = COPY $x12
+ %3:gprnox0 = COPY $x11
+ %2:gprnox0 = COPY $x10
+ %1:vrm8 = COPY $v16m8
+ %0:vrm8 = COPY $v8m8
+ PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F_ALT $t2, %1:vrm8, %1:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %2:gprnox0, %3:gprnox0, %4:gprnox0, 4, 2, implicit $frm
+ PseudoRET
+...
+---
+name: interleave_rvv_and_xsfmm
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: gprnox0 }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: vrm8 }
+ - { id: 5, class: vrm8 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$x10', virtual-reg: '%1' }
+ - { reg: '$x11', virtual-reg: '%2' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $x10, $x11
+ ; CHECK-LABEL: name: interleave_rvv_and_xsfmm
+ ; CHECK: liveins: $v8m8, $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[PseudoSF_VTMV_V_T]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_]], implicit $vtype
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %2:gpr = COPY $x11
+ %1:gprnox0 = COPY $x10
+ %0:vrm8 = COPY $v8m8
+ %3:gpr = ADDI $x0, 1
+ %4:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1
+ %5:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0
+ PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1
+ $v8m8 = COPY %5:vrm8
+ PseudoRET implicit $v8m8
+...
+---
+name: interleave_rvv_and_xsfmm2
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: gprnox0 }
+ - { id: 2, class: gpr }
+ - { id: 3, class: gpr }
+ - { id: 4, class: vrm8 }
+ - { id: 5, class: vrm8 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$x10', virtual-reg: '%1' }
+ - { reg: '$x11', virtual-reg: '%2' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $x10, $x11
+ ; CHECK-LABEL: name: interleave_rvv_and_xsfmm2
+ ; CHECK: liveins: $v8m8, $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x11
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 1
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoVADD_VV_M8_:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[COPY2]], [[COPY2]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 512 /* e8, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead [[PseudoSF_VTMV_V_T:%[0-9]+]]:vrm8 = PseudoSF_VTMV_V_T [[ADDI]], $noreg, 3, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoVSETVLI [[COPY1]], 195 /* e8, m8, ta, ma */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoVADD_VV_M8_1:%[0-9]+]]:vrm8 = PseudoVADD_VV_M8 $noreg, [[PseudoVADD_VV_M8_]], [[PseudoVADD_VV_M8_]], $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoSF_VSTE16 [[ADDI]], [[COPY]], $noreg, 4, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: $v8m8 = COPY [[PseudoVADD_VV_M8_1]], implicit $vtype
+ ; CHECK-NEXT: PseudoRET implicit $v8m8
+ %2:gpr = COPY $x11
+ %1:gprnox0 = COPY $x10
+ %0:vrm8 = COPY $v8m8
+ %3:gpr = ADDI $x0, 1
+ %4:vrm8 = PseudoVADD_VV_M8 $noreg, %0:vrm8, killed %0:vrm8, %1:gprnox0, 3, 0
+ %5:vrm8 = PseudoSF_VTMV_V_T %3:gpr, %1:gprnox0, 3, 1
+ %6:vrm8 = PseudoVADD_VV_M8 $noreg, %4:vrm8, killed %4:vrm8, %1:gprnox0, 3, 0
+ PseudoSF_VSTE16 %3:gpr, %2:gpr, %1:gprnox0, 4, 1
+ $v8m8 = COPY %6:vrm8
+ PseudoRET implicit $v8m8
+...
+---
+name: consecutive_xsfmm
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: vrm8 }
+ - { id: 1, class: gprnox0 }
+ - { id: 2, class: gprnox0 }
+ - { id: 3, class: gprnox0 }
+ - { id: 4, class: gprnox0 }
+liveins:
+ - { reg: '$v8m8', virtual-reg: '%0' }
+ - { reg: '$x10', virtual-reg: '%1' }
+ - { reg: '$x11', virtual-reg: '%2' }
+ - { reg: '$x12', virtual-reg: '%3' }
+ - { reg: '$x13', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $v8m8, $x10, $x11, $x12, $x13
+ ; CHECK-LABEL: name: consecutive_xsfmm
+ ; CHECK: liveins: $v8m8, $x10, $x11, $x12, $x13
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vrm8 = COPY $v8m8
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gprnox0 = COPY $x12
+ ; CHECK-NEXT: dead [[COPY4:%[0-9]+]]:gprnox0 = COPY $x13
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY2]], 1032 /* e16, w2 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY1]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTK [[COPY3]], 4, 2, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_MM_F_F $t2, [[COPY]], [[COPY]], 7, $noreg, $noreg, $noreg, 4, 2, implicit $frm, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY3]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: PseudoSF_VSTE16 [[COPY1]], [[COPY2]], $noreg, 4, 1, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %0:vrm8 = COPY $v8m8
+ %1:gprnox0 = COPY $x10
+ %2:gprnox0 = COPY $x11
+ %3:gprnox0 = COPY $x12
+ %4:gprnox0 = COPY $x13
+ PseudoSF_MM_F_F $t2, %0:vrm8, %0:vrm8, 7, %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 2, implicit $frm
+ PseudoSF_VSTE16 %1:gprnox0, %2:gprnox0, %3:gprnox0, 4, 1
+ PseudoRET
+...
+---
+name: vsettnt_max
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: vsettnt_max
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_1:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ %2:gprnox0 = PseudoSF_VSETTNTX0 $x0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ %3:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %3:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: single_vsettm
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: single_vsettm
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoSF_VSETTM:%[0-9]+]]:gprnox0 = PseudoSF_VSETTM [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTM]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTM %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %1:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: single_vsettn
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: single_vsettn
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[PseudoSF_VSETTNT:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNT [[COPY]], 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTNT]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTNT %0:gprnox0, 520, implicit-def $vl, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %1:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: single_vsettk
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10
+ ; CHECK-LABEL: name: single_vsettk
+ ; CHECK: liveins: $x10
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: dead [[PseudoSF_VSETTNTX0_:%[0-9]+]]:gprnox0 = PseudoSF_VSETTNTX0 killed $x0, 520 /* e16, w1 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: [[PseudoSF_VSETTK:%[0-9]+]]:gprnox0 = PseudoSF_VSETTK [[COPY]], 4, 1, implicit-def $vtype, implicit $vtype, implicit $vtype
+ ; CHECK-NEXT: $x10 = COPY [[PseudoSF_VSETTK]]
+ ; CHECK-NEXT: PseudoRET implicit $x10
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = PseudoSF_VSETTK %0:gprnox0, 4, 1, implicit-def $vtype, implicit $vtype
+ $x10 = COPY %1:gprnox0
+ PseudoRET implicit $x10
+...
+---
+name: sf_vtzero
+alignment: 4
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gprnox0 }
+ - { id: 1, class: gprnox0 }
+liveins:
+ - { reg: '$x10', virtual-reg: '%0' }
+ - { reg: '$x11', virtual-reg: '%1' }
+frameInfo:
+ maxAlignment: 1
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $x10, $x11
+ ; CHECK-LABEL: name: sf_vtzero
+ ; CHECK: liveins: $x10, $x11
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gprnox0 = COPY $x10
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gprnox0 = COPY $x11
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTNT [[COPY1]], 1536 /* e8, w4 */, implicit-def $vl, implicit-def $vtype
+ ; CHECK-NEXT: dead $x0 = PseudoSF_VSETTM [[COPY]], 3, 3, implicit-def $vtype, implicit $vtype
+ ; CHECK-NEXT: PseudoSF_VTZERO_T $t1, $noreg, $noreg, 3, 4, implicit $vl, implicit $vtype
+ ; CHECK-NEXT: PseudoRET
+ %0:gprnox0 = COPY $x10
+ %1:gprnox0 = COPY $x11
+ PseudoSF_VTZERO_T $t1, %0:gprnox0, %1:gprnox0, 3, 4
+ PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/select-to-and-zext.ll b/llvm/test/CodeGen/RISCV/select-to-and-zext.ll
index 2f03ff9..318268a 100644
--- a/llvm/test/CodeGen/RISCV/select-to-and-zext.ll
+++ b/llvm/test/CodeGen/RISCV/select-to-and-zext.ll
@@ -15,8 +15,7 @@ define i32 @from_cmpeq(i32 %xx, i32 %y) {
;
; RV64I-LABEL: from_cmpeq:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
-; RV64I-NEXT: addi a0, a0, -9
+; RV64I-NEXT: addiw a0, a0, -9
; RV64I-NEXT: seqz a0, a0
; RV64I-NEXT: and a0, a0, a1
; RV64I-NEXT: ret
@@ -39,8 +38,7 @@ define i32 @from_cmpeq_fail_bad_andmask(i32 %xx, i32 %y) {
;
; RV64I-LABEL: from_cmpeq_fail_bad_andmask:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
-; RV64I-NEXT: addi a0, a0, -9
+; RV64I-NEXT: addiw a0, a0, -9
; RV64I-NEXT: snez a0, a0
; RV64I-NEXT: addi a0, a0, -1
; RV64I-NEXT: and a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/setcc-logic.ll b/llvm/test/CodeGen/RISCV/setcc-logic.ll
index fabb573..4e14893 100644
--- a/llvm/test/CodeGen/RISCV/setcc-logic.ll
+++ b/llvm/test/CodeGen/RISCV/setcc-logic.ll
@@ -104,9 +104,8 @@ define i1 @and_icmps_const_not1bit_diff(i32 %x) nounwind {
;
; RV64I-LABEL: and_icmps_const_not1bit_diff:
; RV64I: # %bb.0:
-; RV64I-NEXT: sext.w a0, a0
-; RV64I-NEXT: addi a1, a0, -44
-; RV64I-NEXT: addi a0, a0, -92
+; RV64I-NEXT: addiw a1, a0, -44
+; RV64I-NEXT: addiw a0, a0, -92
; RV64I-NEXT: snez a1, a1
; RV64I-NEXT: snez a0, a0
; RV64I-NEXT: and a0, a1, a0
diff --git a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
index bdbe4ed..07bfbe6 100644
--- a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
+++ b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
@@ -674,8 +674,7 @@ define i32 @sext_of_not_cmp_i32(i32 %x) {
;
; RV64-LABEL: sext_of_not_cmp_i32:
; RV64: # %bb.0:
-; RV64-NEXT: sext.w a0, a0
-; RV64-NEXT: addi a0, a0, -7
+; RV64-NEXT: addiw a0, a0, -7
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: addi a0, a0, -1
; RV64-NEXT: ret
@@ -718,8 +717,7 @@ define i32 @dec_of_zexted_cmp_i32(i32 %x) {
;
; RV64-LABEL: dec_of_zexted_cmp_i32:
; RV64: # %bb.0:
-; RV64-NEXT: sext.w a0, a0
-; RV64-NEXT: addi a0, a0, -7
+; RV64-NEXT: addiw a0, a0, -7
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: addi a0, a0, -1
; RV64-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll
index 2751332c..bf6802d 100644
--- a/llvm/test/CodeGen/RISCV/xaluo.ll
+++ b/llvm/test/CodeGen/RISCV/xaluo.ll
@@ -1047,8 +1047,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV64-LABEL: usubo.i32.constant.lhs:
; RV64: # %bb.0: # %entry
; RV64-NEXT: li a2, -2
-; RV64-NEXT: subw a2, a2, a0
-; RV64-NEXT: addi a0, a2, 1
+; RV64-NEXT: sub a2, a2, a0
+; RV64-NEXT: addiw a0, a2, 1
; RV64-NEXT: seqz a0, a0
; RV64-NEXT: sw a2, 0(a1)
; RV64-NEXT: ret
@@ -1065,8 +1065,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV64ZBA-LABEL: usubo.i32.constant.lhs:
; RV64ZBA: # %bb.0: # %entry
; RV64ZBA-NEXT: li a2, -2
-; RV64ZBA-NEXT: subw a2, a2, a0
-; RV64ZBA-NEXT: addi a0, a2, 1
+; RV64ZBA-NEXT: sub a2, a2, a0
+; RV64ZBA-NEXT: addiw a0, a2, 1
; RV64ZBA-NEXT: seqz a0, a0
; RV64ZBA-NEXT: sw a2, 0(a1)
; RV64ZBA-NEXT: ret
@@ -1083,8 +1083,8 @@ define zeroext i1 @usubo.i32.constant.lhs(i32 signext %v1, ptr %res) {
; RV64ZICOND-LABEL: usubo.i32.constant.lhs:
; RV64ZICOND: # %bb.0: # %entry
; RV64ZICOND-NEXT: li a2, -2
-; RV64ZICOND-NEXT: subw a2, a2, a0
-; RV64ZICOND-NEXT: addi a0, a2, 1
+; RV64ZICOND-NEXT: sub a2, a2, a0
+; RV64ZICOND-NEXT: addiw a0, a2, 1
; RV64ZICOND-NEXT: seqz a0, a0
; RV64ZICOND-NEXT: sw a2, 0(a1)
; RV64ZICOND-NEXT: ret
diff --git a/llvm/test/CodeGen/Thumb2/carry.ll b/llvm/test/CodeGen/Thumb2/carry.ll
index 1e2b332..47c7918 100644
--- a/llvm/test/CodeGen/Thumb2/carry.ll
+++ b/llvm/test/CodeGen/Thumb2/carry.ll
@@ -1,35 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
define i64 @f1(i64 %a, i64 %b) {
-entry:
; CHECK-LABEL: f1:
-; CHECK: subs r0, r0, r2
-; CHECK: sbcs r1, r3
- %tmp = sub i64 %a, %b
- ret i64 %tmp
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, r0, r2
+; CHECK-NEXT: sbcs r1, r3
+; CHECK-NEXT: bx lr
+entry:
+ %tmp = sub i64 %a, %b
+ ret i64 %tmp
}
define i64 @f2(i64 %a, i64 %b) {
-entry:
; CHECK-LABEL: f2:
-; CHECK: lsls r1, r1, #1
-; CHECK: orr.w r1, r1, r0, lsr #31
-; CHECK: rsbs r0, r2, r0, lsl #1
-; CHECK: sbcs r1, r3
- %tmp1 = shl i64 %a, 1
- %tmp2 = sub i64 %tmp1, %b
- ret i64 %tmp2
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: lsls r1, r1, #1
+; CHECK-NEXT: orr.w r1, r1, r0, lsr #31
+; CHECK-NEXT: rsbs r0, r2, r0, lsl #1
+; CHECK-NEXT: sbcs r1, r3
+; CHECK-NEXT: bx lr
+entry:
+ %tmp1 = shl i64 %a, 1
+ %tmp2 = sub i64 %tmp1, %b
+ ret i64 %tmp2
}
; rdar://12559385
define i64 @f3(i32 %vi) {
-entry:
; CHECK-LABEL: f3:
-; CHECK: movw [[REG:r[0-9]+]], #36102
-; CHECK: sbcs r{{[0-9]+}}, [[REG]]
- %v0 = zext i32 %vi to i64
- %v1 = xor i64 %v0, -155057456198619
- %v4 = add i64 %v1, 155057456198619
- %v5 = add i64 %v4, %v1
- ret i64 %v5
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movw r1, #19493
+; CHECK-NEXT: movt r1, #57191
+; CHECK-NEXT: eors r0, r1
+; CHECK-NEXT: movw r2, #29433
+; CHECK-NEXT: movw r3, #46043
+; CHECK-NEXT: movw r1, #36102
+; CHECK-NEXT: movt r2, #65535
+; CHECK-NEXT: adds r0, r0, r0
+; CHECK-NEXT: movt r3, #8344
+; CHECK-NEXT: sbcs r2, r1
+; CHECK-NEXT: adds r0, r0, r3
+; CHECK-NEXT: adcs r1, r2
+; CHECK-NEXT: bx lr
+entry:
+ %v0 = zext i32 %vi to i64
+ %v1 = xor i64 %v0, -155057456198619
+ %v4 = add i64 %v1, 155057456198619
+ %v5 = add i64 %v4, %v1
+ ret i64 %v5
}
diff --git a/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll b/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll
new file mode 100644
index 0000000..abbd953
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/mem-intrinsics-offsets.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=mvp -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s
+
+; This test ensures that loads and stores generated for small memcpy et al use
+; constant offset folding.
+
+
+target triple = "wasm32-unknown-unknown"
+
+define void @call_memset(ptr) #0 {
+; CHECK-LABEL: call_memset:
+; CHECK: .functype call_memset (i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i64.const $push0=, 0
+; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0
+; CHECK-NEXT: i64.const $push1=, 0
+; CHECK-NEXT: i64.store 0($0):p2align=0, $pop1
+; CHECK-NEXT: # fallthrough-return
+ call void @llvm.memset.p0.i32(ptr align 1 %0, i8 0, i32 16, i1 false)
+ ret void
+}
+
+define void @call_memcpy(ptr %dst, ptr %src) #0 {
+; CHECK-LABEL: call_memcpy:
+; CHECK: .functype call_memcpy (i32, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0
+; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0
+; CHECK-NEXT: i64.load $push1=, 0($1):p2align=0
+; CHECK-NEXT: i64.store 0($0):p2align=0, $pop1
+; CHECK-NEXT: # fallthrough-return
+ call void @llvm.memcpy.p0.p0.i32(ptr align 1 %dst, ptr align 1 %src, i32 16, i1 false)
+ ret void
+}
+
+
+define void @call_memmove(ptr %dst, ptr %src) #0 {
+; CHECK-LABEL: call_memmove:
+; CHECK: .functype call_memmove (i32, i32) -> ()
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i64.load $2=, 0($1):p2align=0
+; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0
+; CHECK-NEXT: i64.store 8($0):p2align=0, $pop0
+; CHECK-NEXT: i64.store 0($0):p2align=0, $2
+; CHECK-NEXT: # fallthrough-return
+ call void @llvm.memmove.p0.p0.i32(ptr align 1 %dst, ptr align 1 %src, i32 16, i1 false)
+ ret void
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
new file mode 100644
index 0000000..3654aae
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/simd-dot-reductions.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mattr=+simd128 | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+define <4 x i32> @dot_sext_1(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_sext_1:
+; CHECK: .functype dot_sext_1 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+ %sext1 = sext <8 x i16> %a to <8 x i32>
+ %sext2 = sext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %sext1, %sext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
+
+
+define <4 x i32> @dot_sext_2(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_sext_2:
+; CHECK: .functype dot_sext_2 (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+ %sext1 = sext <8 x i16> %a to <8 x i32>
+ %sext2 = sext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %sext1, %sext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle2, %shuffle1
+ ret <4 x i32> %res
+}
+
+define <4 x i32> @dot_sext_self(<8 x i16> %v) {
+; CHECK-LABEL: dot_sext_self:
+; CHECK: .functype dot_sext_self (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.dot_i16x8_s
+; CHECK-NEXT: # fallthrough-return
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %mul = mul <8 x i32> %sext, %sext
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
+
+; INFO: Negative test
+define <4 x i32> @dot_zext(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_zext:
+; CHECK: .functype dot_zext (v128, v128) -> (v128)
+; CHECK-NEXT: .local v128
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_low_i16x8_u
+; CHECK-NEXT: local.tee 2
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_high_i16x8_u
+; CHECK-NEXT: local.tee 1
+; CHECK-NEXT: i8x16.shuffle 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %zext1 = zext <8 x i16> %a to <8 x i32>
+ %zext2 = zext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %zext1, %zext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
+
+; INFO: Negative test
+define <4 x i32> @dot_wrong_shuffle(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: dot_wrong_shuffle:
+; CHECK: .functype dot_wrong_shuffle (v128, v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_low_i16x8_s
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i32x4.extmul_high_i16x8_s
+; CHECK-NEXT: i32x4.add
+; CHECK-NEXT: # fallthrough-return
+ %sext1 = sext <8 x i16> %a to <8 x i32>
+ %sext2 = sext <8 x i16> %b to <8 x i32>
+ %mul = mul <8 x i32> %sext1, %sext2
+ %shuffle1 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %shuffle2 = shufflevector <8 x i32> %mul, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ %res = add <4 x i32> %shuffle1, %shuffle2
+ ret <4 x i32> %res
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
index e065de3..600241a 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fma.ll
@@ -2,9 +2,278 @@
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128,+relaxed-simd | FileCheck %s --check-prefix=RELAXED
; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+fp16,+simd128, | FileCheck %s --check-prefix=STRICT
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=NOFP16
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefix=NOSIMD
target triple = "wasm32"
+define half @fadd_fmul_contract_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fadd_fmul_contract_f16:
+; RELAXED: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: call $push0=, __truncsfhf2, $0
+; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT: call $push2=, __truncsfhf2, $1
+; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT: call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT: return $pop7
+;
+; STRICT-LABEL: fadd_fmul_contract_f16:
+; STRICT: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: call $push0=, __truncsfhf2, $0
+; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT: call $push2=, __truncsfhf2, $1
+; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT: call $push5=, __truncsfhf2, $2
+; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT: f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT: return $pop7
+;
+; NOFP16-LABEL: fadd_fmul_contract_f16:
+; NOFP16: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $0
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: return $pop7
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f16:
+; NOSIMD: .functype fadd_fmul_contract_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $0
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: return $pop7
+ %mul = fmul contract half %b, %a
+ %add = fadd contract half %mul, %c
+ ret half %add
+}
+
+define half @fmuladd_contract_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fmuladd_contract_f16:
+; RELAXED: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: call $push0=, __truncsfhf2, $1
+; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT: call $push2=, __truncsfhf2, $0
+; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT: call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT: return $pop7
+;
+; STRICT-LABEL: fmuladd_contract_f16:
+; STRICT: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: call $push0=, __truncsfhf2, $1
+; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT: call $push2=, __truncsfhf2, $0
+; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT: call $push5=, __truncsfhf2, $2
+; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT: f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT: return $pop7
+;
+; NOFP16-LABEL: fmuladd_contract_f16:
+; NOFP16: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $0
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: return $pop7
+;
+; NOSIMD-LABEL: fmuladd_contract_f16:
+; NOSIMD: .functype fmuladd_contract_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $0
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: return $pop7
+ %fma = call contract half @llvm.fmuladd(half %a, half %b, half %c)
+ ret half %fma
+}
+
+define half @fmuladd_f16(half %a, half %b, half %c) {
+; RELAXED-LABEL: fmuladd_f16:
+; RELAXED: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: call $push0=, __truncsfhf2, $1
+; RELAXED-NEXT: call $push1=, __extendhfsf2, $pop0
+; RELAXED-NEXT: call $push2=, __truncsfhf2, $0
+; RELAXED-NEXT: call $push3=, __extendhfsf2, $pop2
+; RELAXED-NEXT: f32.mul $push4=, $pop1, $pop3
+; RELAXED-NEXT: call $push5=, __truncsfhf2, $2
+; RELAXED-NEXT: call $push6=, __extendhfsf2, $pop5
+; RELAXED-NEXT: f32.add $push7=, $pop4, $pop6
+; RELAXED-NEXT: return $pop7
+;
+; STRICT-LABEL: fmuladd_f16:
+; STRICT: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: call $push0=, __truncsfhf2, $1
+; STRICT-NEXT: call $push1=, __extendhfsf2, $pop0
+; STRICT-NEXT: call $push2=, __truncsfhf2, $0
+; STRICT-NEXT: call $push3=, __extendhfsf2, $pop2
+; STRICT-NEXT: f32.mul $push4=, $pop1, $pop3
+; STRICT-NEXT: call $push5=, __truncsfhf2, $2
+; STRICT-NEXT: call $push6=, __extendhfsf2, $pop5
+; STRICT-NEXT: f32.add $push7=, $pop4, $pop6
+; STRICT-NEXT: return $pop7
+;
+; NOFP16-LABEL: fmuladd_f16:
+; NOFP16: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $0
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: return $pop7
+;
+; NOSIMD-LABEL: fmuladd_f16:
+; NOSIMD: .functype fmuladd_f16 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $0
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: return $pop7
+ %fma = call half @llvm.fmuladd(half %a, half %b, half %c)
+ ret half %fma
+}
+
+
+define float @fadd_fmul_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fadd_fmul_contract_f32:
+; RELAXED: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32.mul $push0=, $1, $0
+; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fadd_fmul_contract_f32:
+; STRICT: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32.mul $push0=, $1, $0
+; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_f32:
+; NOFP16: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32.mul $push0=, $1, $0
+; NOFP16-NEXT: f32.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f32:
+; NOSIMD: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $1, $0
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %mul = fmul contract float %b, %a
+ %add = fadd contract float %mul, %c
+ ret float %add
+}
+
+define float @fmuladd_contract_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fmuladd_contract_f32:
+; RELAXED: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32.mul $push0=, $0, $1
+; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_contract_f32:
+; STRICT: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32.mul $push0=, $0, $1
+; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_f32:
+; NOFP16: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32.mul $push0=, $0, $1
+; NOFP16-NEXT: f32.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_f32:
+; NOSIMD: .functype fmuladd_contract_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $0, $1
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call contract float @llvm.fmuladd(float %a, float %b, float %c)
+ ret float %fma
+}
+
+define float @fmuladd_f32(float %a, float %b, float %c) {
+; RELAXED-LABEL: fmuladd_f32:
+; RELAXED: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32.mul $push0=, $0, $1
+; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_f32:
+; STRICT: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32.mul $push0=, $0, $1
+; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_f32:
+; NOFP16: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32.mul $push0=, $0, $1
+; NOFP16-NEXT: f32.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_f32:
+; NOSIMD: .functype fmuladd_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $0, $1
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call float @llvm.fmuladd(float %a, float %b, float %c)
+ ret float %fma
+}
+
define double @fadd_fmul_contract_f64(double %a, double %b, double %c) {
; RELAXED-LABEL: fadd_fmul_contract_f64:
; RELAXED: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
@@ -19,16 +288,94 @@ define double @fadd_fmul_contract_f64(double %a, double %b, double %c) {
; STRICT-NEXT: f64.mul $push0=, $1, $0
; STRICT-NEXT: f64.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_f64:
+; NOFP16: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64.mul $push0=, $1, $0
+; NOFP16-NEXT: f64.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_f64:
+; NOSIMD: .functype fadd_fmul_contract_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $1, $0
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
%mul = fmul contract double %b, %a
%add = fadd contract double %mul, %c
ret double %add
}
+define double @fmuladd_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fmuladd_f64:
+; RELAXED: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64.mul $push0=, $0, $1
+; RELAXED-NEXT: f64.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_f64:
+; STRICT: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64.mul $push0=, $0, $1
+; STRICT-NEXT: f64.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_f64:
+; NOFP16: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64.mul $push0=, $0, $1
+; NOFP16-NEXT: f64.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_f64:
+; NOSIMD: .functype fmuladd_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $0, $1
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call double @llvm.fmuladd(double %a, double %b, double %c)
+ ret double %fma
+}
+
+define double @fmuladd_contract_f64(double %a, double %b, double %c) {
+; RELAXED-LABEL: fmuladd_contract_f64:
+; RELAXED: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64.mul $push0=, $0, $1
+; RELAXED-NEXT: f64.add $push1=, $pop0, $2
+; RELAXED-NEXT: return $pop1
+;
+; STRICT-LABEL: fmuladd_contract_f64:
+; STRICT: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64.mul $push0=, $0, $1
+; STRICT-NEXT: f64.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_f64:
+; NOFP16: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64.mul $push0=, $0, $1
+; NOFP16-NEXT: f64.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_f64:
+; NOSIMD: .functype fmuladd_contract_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $0, $1
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $2
+; NOSIMD-NEXT: return $pop1
+ %fma = call contract double @llvm.fmuladd(double %a, double %b, double %c)
+ ret double %fma
+}
+
define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fadd_fmul_contract_4xf32:
; RELAXED: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fadd_fmul_contract_4xf32:
@@ -37,31 +384,222 @@ define <4 x float> @fadd_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4
; STRICT-NEXT: f32x4.mul $push0=, $1, $0
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_4xf32:
+; NOFP16: .functype fadd_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $1, $0
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_4xf32:
+; NOSIMD: .functype fadd_fmul_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $8, $4
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $7, $3
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $6, $2
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $5, $1
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%mul = fmul contract <4 x float> %b, %a
%add = fadd contract <4 x float> %mul, %c
ret <4 x float> %add
}
-
define <8 x half> @fadd_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
; RELAXED-LABEL: fadd_fmul_contract_8xf16:
; RELAXED: .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f16x8.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT: f16x8.madd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fadd_fmul_contract_8xf16:
; STRICT: .functype fadd_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; STRICT-NEXT: # %bb.0:
-; STRICT-NEXT: f16x8.mul $push0=, $1, $0
-; STRICT-NEXT: f16x8.add $push1=, $pop0, $2
-; STRICT-NEXT: return $pop1
+; STRICT-NEXT: f16x8.madd $push0=, $1, $0, $2
+; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fadd_fmul_contract_8xf16:
+; NOFP16: .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $8
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $16
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT: i32.store16 14($0), $pop8
+; NOFP16-NEXT: call $push9=, __truncsfhf2, $7
+; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT: call $push11=, __truncsfhf2, $15
+; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT: call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT: i32.store16 12($0), $pop17
+; NOFP16-NEXT: call $push18=, __truncsfhf2, $6
+; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT: call $push20=, __truncsfhf2, $14
+; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT: call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT: i32.store16 10($0), $pop26
+; NOFP16-NEXT: call $push27=, __truncsfhf2, $5
+; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT: call $push29=, __truncsfhf2, $13
+; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT: call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT: i32.store16 8($0), $pop35
+; NOFP16-NEXT: call $push36=, __truncsfhf2, $4
+; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT: call $push38=, __truncsfhf2, $12
+; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT: call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT: i32.store16 6($0), $pop44
+; NOFP16-NEXT: call $push45=, __truncsfhf2, $3
+; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT: call $push47=, __truncsfhf2, $11
+; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT: call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT: i32.store16 4($0), $pop53
+; NOFP16-NEXT: call $push54=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT: call $push56=, __truncsfhf2, $10
+; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT: call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT: i32.store16 2($0), $pop62
+; NOFP16-NEXT: call $push63=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT: call $push65=, __truncsfhf2, $9
+; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT: call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT: i32.store16 0($0), $pop71
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fadd_fmul_contract_8xf16:
+; NOSIMD: .functype fadd_fmul_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $8
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $16
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT: i32.store16 14($0), $pop8
+; NOSIMD-NEXT: call $push9=, __truncsfhf2, $7
+; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT: call $push11=, __truncsfhf2, $15
+; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT: i32.store16 12($0), $pop17
+; NOSIMD-NEXT: call $push18=, __truncsfhf2, $6
+; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT: call $push20=, __truncsfhf2, $14
+; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT: i32.store16 10($0), $pop26
+; NOSIMD-NEXT: call $push27=, __truncsfhf2, $5
+; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT: call $push29=, __truncsfhf2, $13
+; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT: i32.store16 8($0), $pop35
+; NOSIMD-NEXT: call $push36=, __truncsfhf2, $4
+; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT: call $push38=, __truncsfhf2, $12
+; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT: i32.store16 6($0), $pop44
+; NOSIMD-NEXT: call $push45=, __truncsfhf2, $3
+; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT: call $push47=, __truncsfhf2, $11
+; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT: i32.store16 4($0), $pop53
+; NOSIMD-NEXT: call $push54=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT: call $push56=, __truncsfhf2, $10
+; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT: i32.store16 2($0), $pop62
+; NOSIMD-NEXT: call $push63=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT: call $push65=, __truncsfhf2, $9
+; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT: i32.store16 0($0), $pop71
+; NOSIMD-NEXT: return
%mul = fmul contract <8 x half> %b, %a
%add = fadd contract <8 x half> %mul, %c
ret <8 x half> %add
}
-
define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fadd_fmul_4xf32:
; RELAXED: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
@@ -76,16 +614,412 @@ define <4 x float> @fadd_fmul_4xf32(<4 x float> %a, <4 x float> %b, <4 x float>
; STRICT-NEXT: f32x4.mul $push0=, $1, $0
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_4xf32:
+; NOFP16: .functype fadd_fmul_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $1, $0
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_4xf32:
+; NOSIMD: .functype fadd_fmul_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $8, $4
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $7, $3
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $6, $2
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $5, $1
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%mul = fmul <4 x float> %b, %a
%add = fadd contract <4 x float> %mul, %c
ret <4 x float> %add
}
+define <8 x half> @fmuladd_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_contract_8xf16:
+; RELAXED: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_8xf16:
+; STRICT: .functype fmuladd_contract_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.madd $push0=, $0, $1, $2
+; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fmuladd_contract_8xf16:
+; NOFP16: .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $16
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $8
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT: i32.store16 14($0), $pop8
+; NOFP16-NEXT: call $push9=, __truncsfhf2, $15
+; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT: call $push11=, __truncsfhf2, $7
+; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT: call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT: i32.store16 12($0), $pop17
+; NOFP16-NEXT: call $push18=, __truncsfhf2, $14
+; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT: call $push20=, __truncsfhf2, $6
+; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT: call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT: i32.store16 10($0), $pop26
+; NOFP16-NEXT: call $push27=, __truncsfhf2, $13
+; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT: call $push29=, __truncsfhf2, $5
+; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT: call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT: i32.store16 8($0), $pop35
+; NOFP16-NEXT: call $push36=, __truncsfhf2, $12
+; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT: call $push38=, __truncsfhf2, $4
+; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT: call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT: i32.store16 6($0), $pop44
+; NOFP16-NEXT: call $push45=, __truncsfhf2, $11
+; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT: call $push47=, __truncsfhf2, $3
+; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT: call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT: i32.store16 4($0), $pop53
+; NOFP16-NEXT: call $push54=, __truncsfhf2, $10
+; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT: call $push56=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT: call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT: i32.store16 2($0), $pop62
+; NOFP16-NEXT: call $push63=, __truncsfhf2, $9
+; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT: call $push65=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT: call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT: i32.store16 0($0), $pop71
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fmuladd_contract_8xf16:
+; NOSIMD: .functype fmuladd_contract_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $16
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $8
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT: i32.store16 14($0), $pop8
+; NOSIMD-NEXT: call $push9=, __truncsfhf2, $15
+; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT: call $push11=, __truncsfhf2, $7
+; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT: i32.store16 12($0), $pop17
+; NOSIMD-NEXT: call $push18=, __truncsfhf2, $14
+; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT: call $push20=, __truncsfhf2, $6
+; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT: i32.store16 10($0), $pop26
+; NOSIMD-NEXT: call $push27=, __truncsfhf2, $13
+; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT: call $push29=, __truncsfhf2, $5
+; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT: i32.store16 8($0), $pop35
+; NOSIMD-NEXT: call $push36=, __truncsfhf2, $12
+; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT: call $push38=, __truncsfhf2, $4
+; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT: i32.store16 6($0), $pop44
+; NOSIMD-NEXT: call $push45=, __truncsfhf2, $11
+; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT: call $push47=, __truncsfhf2, $3
+; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT: i32.store16 4($0), $pop53
+; NOSIMD-NEXT: call $push54=, __truncsfhf2, $10
+; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT: call $push56=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT: i32.store16 2($0), $pop62
+; NOSIMD-NEXT: call $push63=, __truncsfhf2, $9
+; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT: call $push65=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT: i32.store16 0($0), $pop71
+; NOSIMD-NEXT: return
+ %fma = call contract <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
+
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.madd $push0=, $0, $1, $2
+; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fmuladd_8xf16:
+; NOFP16: .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, __truncsfhf2, $16
+; NOFP16-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOFP16-NEXT: call $push2=, __truncsfhf2, $8
+; NOFP16-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOFP16-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOFP16-NEXT: call $push5=, __truncsfhf2, $24
+; NOFP16-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOFP16-NEXT: f32.add $push7=, $pop4, $pop6
+; NOFP16-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOFP16-NEXT: i32.store16 14($0), $pop8
+; NOFP16-NEXT: call $push9=, __truncsfhf2, $15
+; NOFP16-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOFP16-NEXT: call $push11=, __truncsfhf2, $7
+; NOFP16-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOFP16-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOFP16-NEXT: call $push14=, __truncsfhf2, $23
+; NOFP16-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOFP16-NEXT: f32.add $push16=, $pop13, $pop15
+; NOFP16-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOFP16-NEXT: i32.store16 12($0), $pop17
+; NOFP16-NEXT: call $push18=, __truncsfhf2, $14
+; NOFP16-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOFP16-NEXT: call $push20=, __truncsfhf2, $6
+; NOFP16-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOFP16-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOFP16-NEXT: call $push23=, __truncsfhf2, $22
+; NOFP16-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOFP16-NEXT: f32.add $push25=, $pop22, $pop24
+; NOFP16-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOFP16-NEXT: i32.store16 10($0), $pop26
+; NOFP16-NEXT: call $push27=, __truncsfhf2, $13
+; NOFP16-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOFP16-NEXT: call $push29=, __truncsfhf2, $5
+; NOFP16-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOFP16-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOFP16-NEXT: call $push32=, __truncsfhf2, $21
+; NOFP16-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOFP16-NEXT: f32.add $push34=, $pop31, $pop33
+; NOFP16-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOFP16-NEXT: i32.store16 8($0), $pop35
+; NOFP16-NEXT: call $push36=, __truncsfhf2, $12
+; NOFP16-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOFP16-NEXT: call $push38=, __truncsfhf2, $4
+; NOFP16-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOFP16-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOFP16-NEXT: call $push41=, __truncsfhf2, $20
+; NOFP16-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOFP16-NEXT: f32.add $push43=, $pop40, $pop42
+; NOFP16-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOFP16-NEXT: i32.store16 6($0), $pop44
+; NOFP16-NEXT: call $push45=, __truncsfhf2, $11
+; NOFP16-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOFP16-NEXT: call $push47=, __truncsfhf2, $3
+; NOFP16-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOFP16-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOFP16-NEXT: call $push50=, __truncsfhf2, $19
+; NOFP16-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOFP16-NEXT: f32.add $push52=, $pop49, $pop51
+; NOFP16-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOFP16-NEXT: i32.store16 4($0), $pop53
+; NOFP16-NEXT: call $push54=, __truncsfhf2, $10
+; NOFP16-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOFP16-NEXT: call $push56=, __truncsfhf2, $2
+; NOFP16-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOFP16-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOFP16-NEXT: call $push59=, __truncsfhf2, $18
+; NOFP16-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOFP16-NEXT: f32.add $push61=, $pop58, $pop60
+; NOFP16-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOFP16-NEXT: i32.store16 2($0), $pop62
+; NOFP16-NEXT: call $push63=, __truncsfhf2, $9
+; NOFP16-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOFP16-NEXT: call $push65=, __truncsfhf2, $1
+; NOFP16-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOFP16-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOFP16-NEXT: call $push68=, __truncsfhf2, $17
+; NOFP16-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOFP16-NEXT: f32.add $push70=, $pop67, $pop69
+; NOFP16-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOFP16-NEXT: i32.store16 0($0), $pop71
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fmuladd_8xf16:
+; NOSIMD: .functype fmuladd_8xf16 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, __truncsfhf2, $16
+; NOSIMD-NEXT: call $push1=, __extendhfsf2, $pop0
+; NOSIMD-NEXT: call $push2=, __truncsfhf2, $8
+; NOSIMD-NEXT: call $push3=, __extendhfsf2, $pop2
+; NOSIMD-NEXT: f32.mul $push4=, $pop1, $pop3
+; NOSIMD-NEXT: call $push5=, __truncsfhf2, $24
+; NOSIMD-NEXT: call $push6=, __extendhfsf2, $pop5
+; NOSIMD-NEXT: f32.add $push7=, $pop4, $pop6
+; NOSIMD-NEXT: call $push8=, __truncsfhf2, $pop7
+; NOSIMD-NEXT: i32.store16 14($0), $pop8
+; NOSIMD-NEXT: call $push9=, __truncsfhf2, $15
+; NOSIMD-NEXT: call $push10=, __extendhfsf2, $pop9
+; NOSIMD-NEXT: call $push11=, __truncsfhf2, $7
+; NOSIMD-NEXT: call $push12=, __extendhfsf2, $pop11
+; NOSIMD-NEXT: f32.mul $push13=, $pop10, $pop12
+; NOSIMD-NEXT: call $push14=, __truncsfhf2, $23
+; NOSIMD-NEXT: call $push15=, __extendhfsf2, $pop14
+; NOSIMD-NEXT: f32.add $push16=, $pop13, $pop15
+; NOSIMD-NEXT: call $push17=, __truncsfhf2, $pop16
+; NOSIMD-NEXT: i32.store16 12($0), $pop17
+; NOSIMD-NEXT: call $push18=, __truncsfhf2, $14
+; NOSIMD-NEXT: call $push19=, __extendhfsf2, $pop18
+; NOSIMD-NEXT: call $push20=, __truncsfhf2, $6
+; NOSIMD-NEXT: call $push21=, __extendhfsf2, $pop20
+; NOSIMD-NEXT: f32.mul $push22=, $pop19, $pop21
+; NOSIMD-NEXT: call $push23=, __truncsfhf2, $22
+; NOSIMD-NEXT: call $push24=, __extendhfsf2, $pop23
+; NOSIMD-NEXT: f32.add $push25=, $pop22, $pop24
+; NOSIMD-NEXT: call $push26=, __truncsfhf2, $pop25
+; NOSIMD-NEXT: i32.store16 10($0), $pop26
+; NOSIMD-NEXT: call $push27=, __truncsfhf2, $13
+; NOSIMD-NEXT: call $push28=, __extendhfsf2, $pop27
+; NOSIMD-NEXT: call $push29=, __truncsfhf2, $5
+; NOSIMD-NEXT: call $push30=, __extendhfsf2, $pop29
+; NOSIMD-NEXT: f32.mul $push31=, $pop28, $pop30
+; NOSIMD-NEXT: call $push32=, __truncsfhf2, $21
+; NOSIMD-NEXT: call $push33=, __extendhfsf2, $pop32
+; NOSIMD-NEXT: f32.add $push34=, $pop31, $pop33
+; NOSIMD-NEXT: call $push35=, __truncsfhf2, $pop34
+; NOSIMD-NEXT: i32.store16 8($0), $pop35
+; NOSIMD-NEXT: call $push36=, __truncsfhf2, $12
+; NOSIMD-NEXT: call $push37=, __extendhfsf2, $pop36
+; NOSIMD-NEXT: call $push38=, __truncsfhf2, $4
+; NOSIMD-NEXT: call $push39=, __extendhfsf2, $pop38
+; NOSIMD-NEXT: f32.mul $push40=, $pop37, $pop39
+; NOSIMD-NEXT: call $push41=, __truncsfhf2, $20
+; NOSIMD-NEXT: call $push42=, __extendhfsf2, $pop41
+; NOSIMD-NEXT: f32.add $push43=, $pop40, $pop42
+; NOSIMD-NEXT: call $push44=, __truncsfhf2, $pop43
+; NOSIMD-NEXT: i32.store16 6($0), $pop44
+; NOSIMD-NEXT: call $push45=, __truncsfhf2, $11
+; NOSIMD-NEXT: call $push46=, __extendhfsf2, $pop45
+; NOSIMD-NEXT: call $push47=, __truncsfhf2, $3
+; NOSIMD-NEXT: call $push48=, __extendhfsf2, $pop47
+; NOSIMD-NEXT: f32.mul $push49=, $pop46, $pop48
+; NOSIMD-NEXT: call $push50=, __truncsfhf2, $19
+; NOSIMD-NEXT: call $push51=, __extendhfsf2, $pop50
+; NOSIMD-NEXT: f32.add $push52=, $pop49, $pop51
+; NOSIMD-NEXT: call $push53=, __truncsfhf2, $pop52
+; NOSIMD-NEXT: i32.store16 4($0), $pop53
+; NOSIMD-NEXT: call $push54=, __truncsfhf2, $10
+; NOSIMD-NEXT: call $push55=, __extendhfsf2, $pop54
+; NOSIMD-NEXT: call $push56=, __truncsfhf2, $2
+; NOSIMD-NEXT: call $push57=, __extendhfsf2, $pop56
+; NOSIMD-NEXT: f32.mul $push58=, $pop55, $pop57
+; NOSIMD-NEXT: call $push59=, __truncsfhf2, $18
+; NOSIMD-NEXT: call $push60=, __extendhfsf2, $pop59
+; NOSIMD-NEXT: f32.add $push61=, $pop58, $pop60
+; NOSIMD-NEXT: call $push62=, __truncsfhf2, $pop61
+; NOSIMD-NEXT: i32.store16 2($0), $pop62
+; NOSIMD-NEXT: call $push63=, __truncsfhf2, $9
+; NOSIMD-NEXT: call $push64=, __extendhfsf2, $pop63
+; NOSIMD-NEXT: call $push65=, __truncsfhf2, $1
+; NOSIMD-NEXT: call $push66=, __extendhfsf2, $pop65
+; NOSIMD-NEXT: f32.mul $push67=, $pop64, $pop66
+; NOSIMD-NEXT: call $push68=, __truncsfhf2, $17
+; NOSIMD-NEXT: call $push69=, __extendhfsf2, $pop68
+; NOSIMD-NEXT: f32.add $push70=, $pop67, $pop69
+; NOSIMD-NEXT: call $push71=, __truncsfhf2, $pop70
+; NOSIMD-NEXT: i32.store16 0($0), $pop71
+; NOSIMD-NEXT: return
+ %fma = call <8 x half> @llvm.fmuladd(<8 x half> %a, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
+
define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fmuladd_contract_4xf32:
; RELAXED: .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $2, $0, $1
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fmuladd_contract_4xf32:
@@ -94,18 +1028,40 @@ define <4 x float> @fmuladd_contract_4xf32(<4 x float> %a, <4 x float> %b, <4 x
; STRICT-NEXT: f32x4.mul $push0=, $0, $1
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_4xf32:
+; NOFP16: .functype fmuladd_contract_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $0, $1
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_4xf32:
+; NOSIMD: .functype fmuladd_contract_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $4, $8
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $3, $7
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $2, $6
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $1, $5
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%fma = call contract <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
-; TODO: This should also have relaxed_madd in RELAXED case
define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fmuladd_4xf32:
; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.mul $push0=, $0, $1
-; RELAXED-NEXT: f32x4.add $push1=, $pop0, $2
-; RELAXED-NEXT: return $pop1
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fmuladd_4xf32:
; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
@@ -113,10 +1069,170 @@ define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c
; STRICT-NEXT: f32x4.mul $push0=, $0, $1
; STRICT-NEXT: f32x4.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_4xf32:
+; NOFP16: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $0, $1
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_4xf32:
+; NOSIMD: .functype fmuladd_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $4, $8
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $3, $7
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $2, $6
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $1, $5
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop7
+; NOSIMD-NEXT: return
%fma = call <4 x float> @llvm.fmuladd(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
+define <8 x float> @fmuladd_8xf32(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
+; RELAXED-LABEL: fmuladd_8xf32:
+; RELAXED: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32x4.mul $push0=, $2, $4
+; RELAXED-NEXT: f32x4.add $push1=, $pop0, $6
+; RELAXED-NEXT: v128.store 16($0), $pop1
+; RELAXED-NEXT: f32x4.mul $push2=, $1, $3
+; RELAXED-NEXT: f32x4.add $push3=, $pop2, $5
+; RELAXED-NEXT: v128.store 0($0), $pop3
+; RELAXED-NEXT: return
+;
+; STRICT-LABEL: fmuladd_8xf32:
+; STRICT: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32x4.mul $push0=, $2, $4
+; STRICT-NEXT: f32x4.add $push1=, $pop0, $6
+; STRICT-NEXT: v128.store 16($0), $pop1
+; STRICT-NEXT: f32x4.mul $push2=, $1, $3
+; STRICT-NEXT: f32x4.add $push3=, $pop2, $5
+; STRICT-NEXT: v128.store 0($0), $pop3
+; STRICT-NEXT: return
+;
+; NOFP16-LABEL: fmuladd_8xf32:
+; NOFP16: .functype fmuladd_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $2, $4
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $6
+; NOFP16-NEXT: v128.store 16($0), $pop1
+; NOFP16-NEXT: f32x4.mul $push2=, $1, $3
+; NOFP16-NEXT: f32x4.add $push3=, $pop2, $5
+; NOFP16-NEXT: v128.store 0($0), $pop3
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fmuladd_8xf32:
+; NOSIMD: .functype fmuladd_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $8, $16
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $24
+; NOSIMD-NEXT: f32.store 28($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $7, $15
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $23
+; NOSIMD-NEXT: f32.store 24($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $6, $14
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $22
+; NOSIMD-NEXT: f32.store 20($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $5, $13
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $21
+; NOSIMD-NEXT: f32.store 16($0), $pop7
+; NOSIMD-NEXT: f32.mul $push8=, $4, $12
+; NOSIMD-NEXT: f32.add $push9=, $pop8, $20
+; NOSIMD-NEXT: f32.store 12($0), $pop9
+; NOSIMD-NEXT: f32.mul $push10=, $3, $11
+; NOSIMD-NEXT: f32.add $push11=, $pop10, $19
+; NOSIMD-NEXT: f32.store 8($0), $pop11
+; NOSIMD-NEXT: f32.mul $push12=, $2, $10
+; NOSIMD-NEXT: f32.add $push13=, $pop12, $18
+; NOSIMD-NEXT: f32.store 4($0), $pop13
+; NOSIMD-NEXT: f32.mul $push14=, $1, $9
+; NOSIMD-NEXT: f32.add $push15=, $pop14, $17
+; NOSIMD-NEXT: f32.store 0($0), $pop15
+; NOSIMD-NEXT: return
+ %fma = call <8 x float> @llvm.fmuladd(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+ ret <8 x float> %fma
+}
+
+define <2 x double> @fmuladd_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_contract_2xf64:
+; RELAXED: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_contract_2xf64:
+; STRICT: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64x2.mul $push0=, $0, $1
+; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_contract_2xf64:
+; NOFP16: .functype fmuladd_contract_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $0, $1
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_contract_2xf64:
+; NOSIMD: .functype fmuladd_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $2, $4
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $1, $3
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
+ %fma = call contract <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %fma
+}
+
+define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_2xf64:
+; RELAXED: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_2xf64:
+; STRICT: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64x2.mul $push0=, $0, $1
+; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
+; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fmuladd_2xf64:
+; NOFP16: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $0, $1
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fmuladd_2xf64:
+; NOSIMD: .functype fmuladd_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $2, $4
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $1, $3
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
+ %fma = call <2 x double> @llvm.fmuladd(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %fma
+}
+
define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; RELAXED-LABEL: fma_4xf32:
; RELAXED: .functype fma_4xf32 (v128, v128, v128) -> (v128)
@@ -167,6 +1283,44 @@ define <4 x float> @fma_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
; STRICT-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
; STRICT-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
; STRICT-NEXT: return $pop19
+;
+; NOFP16-LABEL: fma_4xf32:
+; NOFP16: .functype fma_4xf32 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.extract_lane $push2=, $0, 0
+; NOFP16-NEXT: f32x4.extract_lane $push1=, $1, 0
+; NOFP16-NEXT: f32x4.extract_lane $push0=, $2, 0
+; NOFP16-NEXT: call $push3=, fmaf, $pop2, $pop1, $pop0
+; NOFP16-NEXT: f32x4.splat $push4=, $pop3
+; NOFP16-NEXT: f32x4.extract_lane $push7=, $0, 1
+; NOFP16-NEXT: f32x4.extract_lane $push6=, $1, 1
+; NOFP16-NEXT: f32x4.extract_lane $push5=, $2, 1
+; NOFP16-NEXT: call $push8=, fmaf, $pop7, $pop6, $pop5
+; NOFP16-NEXT: f32x4.replace_lane $push9=, $pop4, 1, $pop8
+; NOFP16-NEXT: f32x4.extract_lane $push12=, $0, 2
+; NOFP16-NEXT: f32x4.extract_lane $push11=, $1, 2
+; NOFP16-NEXT: f32x4.extract_lane $push10=, $2, 2
+; NOFP16-NEXT: call $push13=, fmaf, $pop12, $pop11, $pop10
+; NOFP16-NEXT: f32x4.replace_lane $push14=, $pop9, 2, $pop13
+; NOFP16-NEXT: f32x4.extract_lane $push17=, $0, 3
+; NOFP16-NEXT: f32x4.extract_lane $push16=, $1, 3
+; NOFP16-NEXT: f32x4.extract_lane $push15=, $2, 3
+; NOFP16-NEXT: call $push18=, fmaf, $pop17, $pop16, $pop15
+; NOFP16-NEXT: f32x4.replace_lane $push19=, $pop14, 3, $pop18
+; NOFP16-NEXT: return $pop19
+;
+; NOSIMD-LABEL: fma_4xf32:
+; NOSIMD: .functype fma_4xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, fmaf, $4, $8, $12
+; NOSIMD-NEXT: f32.store 12($0), $pop0
+; NOSIMD-NEXT: call $push1=, fmaf, $3, $7, $11
+; NOSIMD-NEXT: f32.store 8($0), $pop1
+; NOSIMD-NEXT: call $push2=, fmaf, $2, $6, $10
+; NOSIMD-NEXT: f32.store 4($0), $pop2
+; NOSIMD-NEXT: call $push3=, fmaf, $1, $5, $9
+; NOSIMD-NEXT: f32.store 0($0), $pop3
+; NOSIMD-NEXT: return
%fma = call <4 x float> @llvm.fma(<4 x float> %a, <4 x float> %b, <4 x float> %c)
ret <4 x float> %fma
}
@@ -176,9 +1330,9 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
; RELAXED-LABEL: fadd_fmul_contract_8xf32:
; RELAXED: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $6, $4, $2
+; RELAXED-NEXT: f32x4.relaxed_madd $push0=, $4, $2, $6
; RELAXED-NEXT: v128.store 16($0), $pop0
-; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $5, $3, $1
+; RELAXED-NEXT: f32x4.relaxed_madd $push1=, $3, $1, $5
; RELAXED-NEXT: v128.store 0($0), $pop1
; RELAXED-NEXT: return
;
@@ -192,17 +1346,56 @@ define <8 x float> @fadd_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
; STRICT-NEXT: f32x4.add $push3=, $pop2, $5
; STRICT-NEXT: v128.store 0($0), $pop3
; STRICT-NEXT: return
+;
+; NOFP16-LABEL: fadd_fmul_contract_8xf32:
+; NOFP16: .functype fadd_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f32x4.mul $push0=, $4, $2
+; NOFP16-NEXT: f32x4.add $push1=, $pop0, $6
+; NOFP16-NEXT: v128.store 16($0), $pop1
+; NOFP16-NEXT: f32x4.mul $push2=, $3, $1
+; NOFP16-NEXT: f32x4.add $push3=, $pop2, $5
+; NOFP16-NEXT: v128.store 0($0), $pop3
+; NOFP16-NEXT: return
+;
+; NOSIMD-LABEL: fadd_fmul_contract_8xf32:
+; NOSIMD: .functype fadd_fmul_contract_8xf32 (i32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f32.mul $push0=, $16, $8
+; NOSIMD-NEXT: f32.add $push1=, $pop0, $24
+; NOSIMD-NEXT: f32.store 28($0), $pop1
+; NOSIMD-NEXT: f32.mul $push2=, $15, $7
+; NOSIMD-NEXT: f32.add $push3=, $pop2, $23
+; NOSIMD-NEXT: f32.store 24($0), $pop3
+; NOSIMD-NEXT: f32.mul $push4=, $14, $6
+; NOSIMD-NEXT: f32.add $push5=, $pop4, $22
+; NOSIMD-NEXT: f32.store 20($0), $pop5
+; NOSIMD-NEXT: f32.mul $push6=, $13, $5
+; NOSIMD-NEXT: f32.add $push7=, $pop6, $21
+; NOSIMD-NEXT: f32.store 16($0), $pop7
+; NOSIMD-NEXT: f32.mul $push8=, $12, $4
+; NOSIMD-NEXT: f32.add $push9=, $pop8, $20
+; NOSIMD-NEXT: f32.store 12($0), $pop9
+; NOSIMD-NEXT: f32.mul $push10=, $11, $3
+; NOSIMD-NEXT: f32.add $push11=, $pop10, $19
+; NOSIMD-NEXT: f32.store 8($0), $pop11
+; NOSIMD-NEXT: f32.mul $push12=, $10, $2
+; NOSIMD-NEXT: f32.add $push13=, $pop12, $18
+; NOSIMD-NEXT: f32.store 4($0), $pop13
+; NOSIMD-NEXT: f32.mul $push14=, $9, $1
+; NOSIMD-NEXT: f32.add $push15=, $pop14, $17
+; NOSIMD-NEXT: f32.store 0($0), $pop15
+; NOSIMD-NEXT: return
%mul = fmul contract <8 x float> %b, %a
%add = fadd contract <8 x float> %mul, %c
ret <8 x float> %add
}
-
define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
; RELAXED-LABEL: fadd_fmul_contract_2xf64:
; RELAXED: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $2, $1, $0
+; RELAXED-NEXT: f64x2.relaxed_madd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fadd_fmul_contract_2xf64:
@@ -211,28 +1404,64 @@ define <2 x double> @fadd_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b,
; STRICT-NEXT: f64x2.mul $push0=, $1, $0
; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
+;
+; NOFP16-LABEL: fadd_fmul_contract_2xf64:
+; NOFP16: .functype fadd_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $1, $0
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_contract_2xf64:
+; NOSIMD: .functype fadd_fmul_contract_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $4, $2
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $3, $1
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
%mul = fmul contract <2 x double> %b, %a
%add = fadd contract <2 x double> %mul, %c
ret <2 x double> %add
}
-define float @fadd_fmul_contract_f32(float %a, float %b, float %c) {
-; RELAXED-LABEL: fadd_fmul_contract_f32:
-; RELAXED: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+define <2 x double> @fadd_fmul_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fadd_fmul_2xf64:
+; RELAXED: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32.mul $push0=, $1, $0
-; RELAXED-NEXT: f32.add $push1=, $pop0, $2
+; RELAXED-NEXT: f64x2.mul $push0=, $1, $0
+; RELAXED-NEXT: f64x2.add $push1=, $pop0, $2
; RELAXED-NEXT: return $pop1
;
-; STRICT-LABEL: fadd_fmul_contract_f32:
-; STRICT: .functype fadd_fmul_contract_f32 (f32, f32, f32) -> (f32)
+; STRICT-LABEL: fadd_fmul_2xf64:
+; STRICT: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
; STRICT-NEXT: # %bb.0:
-; STRICT-NEXT: f32.mul $push0=, $1, $0
-; STRICT-NEXT: f32.add $push1=, $pop0, $2
+; STRICT-NEXT: f64x2.mul $push0=, $1, $0
+; STRICT-NEXT: f64x2.add $push1=, $pop0, $2
; STRICT-NEXT: return $pop1
- %mul = fmul contract float %b, %a
- %add = fadd contract float %mul, %c
- ret float %add
+;
+; NOFP16-LABEL: fadd_fmul_2xf64:
+; NOFP16: .functype fadd_fmul_2xf64 (v128, v128, v128) -> (v128)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: f64x2.mul $push0=, $1, $0
+; NOFP16-NEXT: f64x2.add $push1=, $pop0, $2
+; NOFP16-NEXT: return $pop1
+;
+; NOSIMD-LABEL: fadd_fmul_2xf64:
+; NOSIMD: .functype fadd_fmul_2xf64 (i32, f64, f64, f64, f64, f64, f64) -> ()
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: f64.mul $push0=, $4, $2
+; NOSIMD-NEXT: f64.add $push1=, $pop0, $6
+; NOSIMD-NEXT: f64.store 8($0), $pop1
+; NOSIMD-NEXT: f64.mul $push2=, $3, $1
+; NOSIMD-NEXT: f64.add $push3=, $pop2, $5
+; NOSIMD-NEXT: f64.store 0($0), $pop3
+; NOSIMD-NEXT: return
+ %mul = fmul <2 x double> %b, %a
+ %add = fadd <2 x double> %mul, %c
+ ret <2 x double> %add
}
define float @fma_f32(float %a, float %b, float %c) {
@@ -247,6 +1476,18 @@ define float @fma_f32(float %a, float %b, float %c) {
; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: call $push0=, fmaf, $0, $1, $2
; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fma_f32:
+; NOFP16: .functype fma_f32 (f32, f32, f32) -> (f32)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, fmaf, $0, $1, $2
+; NOFP16-NEXT: return $pop0
+;
+; NOSIMD-LABEL: fma_f32:
+; NOSIMD: .functype fma_f32 (f32, f32, f32) -> (f32)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, fmaf, $0, $1, $2
+; NOSIMD-NEXT: return $pop0
%fma = call float @llvm.fma(float %a, float %b, float %c)
ret float %fma
}
@@ -263,6 +1504,18 @@ define double @fma_f64(double %a, double %b, double %c) {
; STRICT-NEXT: # %bb.0:
; STRICT-NEXT: call $push0=, fma, $0, $1, $2
; STRICT-NEXT: return $pop0
+;
+; NOFP16-LABEL: fma_f64:
+; NOFP16: .functype fma_f64 (f64, f64, f64) -> (f64)
+; NOFP16-NEXT: # %bb.0:
+; NOFP16-NEXT: call $push0=, fma, $0, $1, $2
+; NOFP16-NEXT: return $pop0
+;
+; NOSIMD-LABEL: fma_f64:
+; NOSIMD: .functype fma_f64 (f64, f64, f64) -> (f64)
+; NOSIMD-NEXT: # %bb.0:
+; NOSIMD-NEXT: call $push0=, fma, $0, $1, $2
+; NOSIMD-NEXT: return $pop0
%fma = call double @llvm.fma(double %a, double %b, double %c)
ret double %fma
}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
index 6e2d860..b90c1da 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-relaxed-fnma.ll
@@ -27,7 +27,7 @@ define <4 x float> @fsub_fmul_contract_4xf32(<4 x float> %a, <4 x float> %b, <4
; RELAXED-LABEL: fsub_fmul_contract_4xf32:
; RELAXED: .functype fsub_fmul_contract_4xf32 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fsub_fmul_contract_4xf32:
@@ -46,15 +46,14 @@ define <8 x half> @fsub_fmul_contract_8xf16(<8 x half> %a, <8 x half> %b, <8 x h
; RELAXED-LABEL: fsub_fmul_contract_8xf16:
; RELAXED: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f16x8.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: f16x8.nmadd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fsub_fmul_contract_8xf16:
; STRICT: .functype fsub_fmul_contract_8xf16 (v128, v128, v128) -> (v128)
; STRICT-NEXT: # %bb.0:
-; STRICT-NEXT: f16x8.mul $push0=, $1, $0
-; STRICT-NEXT: f16x8.sub $push1=, $2, $pop0
-; STRICT-NEXT: return $pop1
+; STRICT-NEXT: f16x8.nmadd $push0=, $1, $0, $2
+; STRICT-NEXT: return $pop0
%mul = fmul contract <8 x half> %b, %a
%sub = fsub contract <8 x half> %c, %mul
ret <8 x half> %sub
@@ -84,9 +83,9 @@ define <8 x float> @fsub_fmul_contract_8xf32(<8 x float> %a, <8 x float> %b, <8
; RELAXED-LABEL: fsub_fmul_contract_8xf32:
; RELAXED: .functype fsub_fmul_contract_8xf32 (i32, v128, v128, v128, v128, v128, v128) -> ()
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $6, $4, $2
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $4, $2, $6
; RELAXED-NEXT: v128.store 16($0), $pop0
-; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $5, $3, $1
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push1=, $3, $1, $5
; RELAXED-NEXT: v128.store 0($0), $pop1
; RELAXED-NEXT: return
;
@@ -110,7 +109,7 @@ define <2 x double> @fsub_fmul_contract_2xf64(<2 x double> %a, <2 x double> %b,
; RELAXED-LABEL: fsub_fmul_contract_2xf64:
; RELAXED: .functype fsub_fmul_contract_2xf64 (v128, v128, v128) -> (v128)
; RELAXED-NEXT: # %bb.0:
-; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $2, $1, $0
+; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $1, $0, $2
; RELAXED-NEXT: return $pop0
;
; STRICT-LABEL: fsub_fmul_contract_2xf64:
@@ -143,3 +142,55 @@ define float @fsub_fmul_contract_f32(float %a, float %b, float %c) {
ret float %sub
}
+define <8 x half> @fmuladd_8xf16(<8 x half> %a, <8 x half> %b, <8 x half> %c) {
+; RELAXED-LABEL: fmuladd_8xf16:
+; RELAXED: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f16x8.nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_8xf16:
+; STRICT: .functype fmuladd_8xf16 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f16x8.nmadd $push0=, $0, $1, $2
+; STRICT-NEXT: return $pop0
+ %fneg = fneg <8 x half> %a
+ %fma = call <8 x half> @llvm.fmuladd(<8 x half> %fneg, <8 x half> %b, <8 x half> %c)
+ ret <8 x half> %fma
+}
+
+define <4 x float> @fmuladd_4xf32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; RELAXED-LABEL: fmuladd_4xf32:
+; RELAXED: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f32x4.relaxed_nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_4xf32:
+; STRICT: .functype fmuladd_4xf32 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f32x4.mul $push0=, $0, $1
+; STRICT-NEXT: f32x4.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %fneg = fneg <4 x float> %a
+ %fma = call <4 x float> @llvm.fmuladd(<4 x float> %fneg, <4 x float> %b, <4 x float> %c)
+ ret <4 x float> %fma
+}
+
+define <2 x double> @fmuladd_2xf64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
+; RELAXED-LABEL: fmuladd_2xf64:
+; RELAXED: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; RELAXED-NEXT: # %bb.0:
+; RELAXED-NEXT: f64x2.relaxed_nmadd $push0=, $0, $1, $2
+; RELAXED-NEXT: return $pop0
+;
+; STRICT-LABEL: fmuladd_2xf64:
+; STRICT: .functype fmuladd_2xf64 (v128, v128, v128) -> (v128)
+; STRICT-NEXT: # %bb.0:
+; STRICT-NEXT: f64x2.mul $push0=, $0, $1
+; STRICT-NEXT: f64x2.sub $push1=, $2, $pop0
+; STRICT-NEXT: return $pop1
+ %fneg = fneg <2 x double> %a
+ %fma = call <2 x double> @llvm.fmuladd(<2 x double> %fneg, <2 x double> %b, <2 x double> %c)
+ ret <2 x double> %fma
+}
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 0de308a..5152c005 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -728,45 +728,70 @@ define void @avg_v32i8_2(ptr %a, ptr %b) nounwind {
define void @avg_v64i8_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v64i8_2:
; SSE2: # %bb.0:
-; SSE2-NEXT: movaps (%rsi), %xmm0
-; SSE2-NEXT: movaps 16(%rsi), %xmm1
-; SSE2-NEXT: movaps 32(%rsi), %xmm2
-; SSE2-NEXT: movaps 48(%rsi), %xmm3
-; SSE2-NEXT: movups %xmm3, (%rax)
-; SSE2-NEXT: movups %xmm2, (%rax)
-; SSE2-NEXT: movups %xmm1, (%rax)
-; SSE2-NEXT: movups %xmm0, (%rax)
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movdqa 16(%rdi), %xmm1
+; SSE2-NEXT: movdqa 32(%rdi), %xmm2
+; SSE2-NEXT: movdqa 48(%rdi), %xmm3
+; SSE2-NEXT: pavgb (%rsi), %xmm0
+; SSE2-NEXT: pavgb 16(%rsi), %xmm1
+; SSE2-NEXT: pavgb 32(%rsi), %xmm2
+; SSE2-NEXT: pavgb 48(%rsi), %xmm3
+; SSE2-NEXT: movdqu %xmm3, (%rax)
+; SSE2-NEXT: movdqu %xmm2, (%rax)
+; SSE2-NEXT: movdqu %xmm1, (%rax)
+; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_2:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps (%rsi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rsi), %ymm1
-; AVX1-NEXT: vmovups %ymm1, (%rax)
-; AVX1-NEXT: vmovups %ymm0, (%rax)
-; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vpavgb (%rsi), %xmm0, %xmm0
+; AVX1-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1
+; AVX1-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vpavgb 48(%rsi), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqu %xmm3, (%rax)
+; AVX1-NEXT: vmovdqu %xmm2, (%rax)
+; AVX1-NEXT: vmovdqu %xmm1, (%rax)
+; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_2:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovaps (%rsi), %ymm0
-; AVX2-NEXT: vmovaps 32(%rsi), %ymm1
-; AVX2-NEXT: vmovups %ymm1, (%rax)
-; AVX2-NEXT: vmovups %ymm0, (%rax)
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-NEXT: vpavgb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
+; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512-LABEL: avg_v64i8_2:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps (%rsi), %zmm0
-; AVX512-NEXT: vmovups %zmm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
+; AVX512F-LABEL: avg_v64i8_2:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512F-NEXT: vpavgb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vpavgb 32(%rsi), %ymm1, %ymm1
+; AVX512F-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512F-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: avg_v64i8_2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpavgb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%1 = load <64 x i8>, ptr %a
%2 = load <64 x i8>, ptr %b
%3 = zext <64 x i8> %1 to <64 x i32>
%4 = zext <64 x i8> %2 to <64 x i32>
- %5 = add nuw nsw <64 x i32> %4, %4
+ %5 = add nuw nsw <64 x i32> %3, %4
%6 = add nuw nsw <64 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = lshr <64 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <64 x i32> %7 to <64 x i8>
@@ -774,7 +799,6 @@ define void @avg_v64i8_2(ptr %a, ptr %b) nounwind {
ret void
}
-
define void @avg_v4i16_2(ptr %a, ptr %b) nounwind {
; SSE2-LABEL: avg_v4i16_2:
; SSE2: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
index a0c243b..f3950b7 100644
--- a/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
+++ b/llvm/test/CodeGen/X86/global-variable-partition-with-dap.ll
@@ -1,16 +1,15 @@
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-;; A minimal test case. llc will crash if global variables already has a section
-;; prefix. Subsequent PRs will expand on this test case to test the hotness
-;; reconciliation implementation.
-
-; RUN: not llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \
+;; A minimal test case. Subsequent PRs will expand on this test case
+;; (e.g., with more functions, variables and profiles) and test the hotness
+;; reconciliation implementation.
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic \
; RUN: -partition-static-data-sections=true \
; RUN: -data-sections=true -unique-section-names=false \
-; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=ERR
+; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=IR
-; ERR: Global variable hot_bss already has a section prefix hot
+; IR: .section .bss.hot.,"aw"
@hot_bss = internal global i32 0, !section_prefix !17
diff --git a/llvm/test/CodeGen/X86/global-variable-partition.ll b/llvm/test/CodeGen/X86/global-variable-partition.ll
index ce06d17..604b4fd 100644
--- a/llvm/test/CodeGen/X86/global-variable-partition.ll
+++ b/llvm/test/CodeGen/X86/global-variable-partition.ll
@@ -106,23 +106,31 @@ target triple = "x86_64-unknown-linux-gnu"
; UNIQ-NEXT: .section .data.unlikely.,"aw",@progbits,unique,8
; AGG-NEXT: .section .data.unlikely.,"aw",@progbits
+;; The `.section` directive is omitted for .data with -unique-section-names=false.
+; See MCSectionELF::shouldOmitSectionDirective for the implementation details.
+
; For @data_with_unknown_hotness
; SYM: .type .Ldata_with_unknown_hotness,@object # @data_with_unknown_hotness
; SYM: .section .data..Ldata_with_unknown_hotness,"aw",@progbits
; UNIQ: .section .data,"aw",@progbits,unique,9
-; The `.section` directive is omitted for .data with -unique-section-names=false.
-; See MCSectionELF::shouldOmitSectionDirective for the implementation details.
+
; AGG: .data
; COMMON: .Ldata_with_unknown_hotness:
-; For @hot_data_custom_bar_section
-; It has an explicit section attribute 'var' and shouldn't have hot or unlikely suffix.
+; For variables that are not eligible for section prefix annotation
; COMMON: .type hot_data_custom_bar_section,@object
; SYM-NEXT: .section bar,"aw",@progbits
; SYM: hot_data_custom_bar_section
; UNIQ: .section bar,"aw",@progbits
; AGG: .section bar,"aw",@progbits
+; SYM: .section .data.llvm.fake_var,"aw"
+; UNIQ: .section .data,"aw"
+; AGG: .data
+
+;; No section for linker declaration
+; COMMON-NOT: qux
+
@.str = private unnamed_addr constant [5 x i8] c"hot\09\00", align 1
@.str.1 = private unnamed_addr constant [10 x i8] c"%d\09%d\09%d\0A\00", align 1
@hot_relro_array = internal constant [2 x ptr] [ptr @bss2, ptr @data3]
@@ -137,6 +145,8 @@ target triple = "x86_64-unknown-linux-gnu"
@data3 = internal global i32 3
@data_with_unknown_hotness = private global i32 5
@hot_data_custom_bar_section = internal global i32 101 #0
+@llvm.fake_var = internal global i32 123
+@qux = external global i64
define void @cold_func(i32 %0) !prof !15 {
%2 = load i32, ptr @cold_bss
diff --git a/llvm/test/CodeGen/X86/relptr-rodata.ll b/llvm/test/CodeGen/X86/relptr-rodata.ll
index ea22b08..954ea8f 100644
--- a/llvm/test/CodeGen/X86/relptr-rodata.ll
+++ b/llvm/test/CodeGen/X86/relptr-rodata.ll
@@ -10,16 +10,31 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: .long hidden-rodata
@rodata = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @rodata to i64)) to i32)
+; CHECK: .section .rodata.rodata_ptrtoaddr
+; CHECK: rodata_ptrtoaddr:
+; CHECK: .long hidden-rodata_ptrtoaddr
+@rodata_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @rodata_ptrtoaddr to i64)) to i32)
+
; CHECK: .section .data.rel.ro.relro1
; CHECK: relro1:
; CHECK: .long default-relro1
@relro1 = hidden constant i32 trunc (i64 sub (i64 ptrtoint (ptr @default to i64), i64 ptrtoint (ptr @relro1 to i64)) to i32)
+; CHECK: .section .data.rel.ro.relro1_ptrtoaddr
+; CHECK: relro1_ptrtoaddr:
+; CHECK: .long default-relro1_ptrtoaddr
+@relro1_ptrtoaddr = hidden constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @default to i64), i64 ptrtoaddr (ptr @relro1_ptrtoaddr to i64)) to i32)
+
; CHECK: .section .data.rel.ro.relro2
; CHECK: relro2:
; CHECK: .long hidden-relro2
@relro2 = constant i32 trunc (i64 sub (i64 ptrtoint (ptr @hidden to i64), i64 ptrtoint (ptr @relro2 to i64)) to i32)
+; CHECK: .section .data.rel.ro.relro2_ptrtoaddr
+; CHECK: relro2_ptrtoaddr:
+; CHECK: .long hidden-relro2_ptrtoaddr
+@relro2_ptrtoaddr = constant i32 trunc (i64 sub (i64 ptrtoaddr (ptr @hidden to i64), i64 ptrtoaddr (ptr @relro2_ptrtoaddr to i64)) to i32)
+
; CHECK: .section .rodata.obj
; CHECK-NEXT: .globl obj
; CHECK: obj:
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index 5aa266d..69abf6e 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -1447,3 +1447,158 @@ define i1 @eq_i512_load_arg(ptr%p, i512 %b) {
%r = icmp eq i512 %a, %b
ret i1 %r
}
+
+; Tests for any/allbits from memory.
+
+define i1 @anybits_i128_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i128_load_arg:
+; ANY: # %bb.0:
+; ANY-NEXT: movq (%rdi), %rax
+; ANY-NEXT: orq 8(%rdi), %rax
+; ANY-NEXT: setne %al
+; ANY-NEXT: retq
+ %ld = load i128, ptr %w
+ %cmp = icmp ne i128 %ld, 0
+ ret i1 %cmp
+}
+
+define i1 @allbits_i128_load_arg(ptr %w) {
+; SSE2-LABEL: allbits_i128_load_arg:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqb (%rdi), %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
+; SSE2-NEXT: sete %al
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: allbits_i128_load_arg:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa (%rdi), %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: ptest %xmm1, %xmm0
+; SSE41-NEXT: setb %al
+; SSE41-NEXT: retq
+;
+; AVXANY-LABEL: allbits_i128_load_arg:
+; AVXANY: # %bb.0:
+; AVXANY-NEXT: vmovdqa (%rdi), %xmm0
+; AVXANY-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVXANY-NEXT: vptest %xmm1, %xmm0
+; AVXANY-NEXT: setb %al
+; AVXANY-NEXT: retq
+ %ld = load i128, ptr %w
+ %cmp = icmp eq i128 %ld, -1
+ ret i1 %cmp
+}
+
+define i1 @anybits_i256_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i256_load_arg:
+; ANY: # %bb.0:
+; ANY-NEXT: movq (%rdi), %rax
+; ANY-NEXT: movq 8(%rdi), %rcx
+; ANY-NEXT: orq 24(%rdi), %rcx
+; ANY-NEXT: orq 16(%rdi), %rax
+; ANY-NEXT: orq %rcx, %rax
+; ANY-NEXT: setne %al
+; ANY-NEXT: retq
+ %ld = load i256, ptr %w
+ %cmp = icmp ne i256 %ld, 0
+ ret i1 %cmp
+}
+
+define i1 @allbits_i256_load_arg(ptr %w) {
+; SSE-LABEL: allbits_i256_load_arg:
+; SSE: # %bb.0:
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: andq 24(%rdi), %rcx
+; SSE-NEXT: andq 16(%rdi), %rax
+; SSE-NEXT: andq %rcx, %rax
+; SSE-NEXT: cmpq $-1, %rax
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: allbits_i256_load_arg:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqu (%rdi), %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT: vptest %ymm1, %ymm0
+; AVX1-NEXT: setb %al
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: allbits_i256_load_arg:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vptest %ymm1, %ymm0
+; AVX2-NEXT: setb %al
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: allbits_i256_load_arg:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqu (%rdi), %ymm0
+; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512-NEXT: vptest %ymm1, %ymm0
+; AVX512-NEXT: setb %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %ld = load i256, ptr %w
+ %cmp = icmp eq i256 %ld, -1
+ ret i1 %cmp
+}
+
+define i1 @anybits_i512_load_arg(ptr %w) {
+; ANY-LABEL: anybits_i512_load_arg:
+; ANY: # %bb.0:
+; ANY-NEXT: movq 16(%rdi), %rax
+; ANY-NEXT: movq (%rdi), %rcx
+; ANY-NEXT: movq 8(%rdi), %rdx
+; ANY-NEXT: movq 24(%rdi), %rsi
+; ANY-NEXT: orq 56(%rdi), %rsi
+; ANY-NEXT: orq 40(%rdi), %rdx
+; ANY-NEXT: orq %rsi, %rdx
+; ANY-NEXT: orq 48(%rdi), %rax
+; ANY-NEXT: orq 32(%rdi), %rcx
+; ANY-NEXT: orq %rax, %rcx
+; ANY-NEXT: orq %rdx, %rcx
+; ANY-NEXT: setne %al
+; ANY-NEXT: retq
+ %ld = load i512, ptr %w
+ %cmp = icmp ne i512 %ld, 0
+ ret i1 %cmp
+}
+
+define i1 @allbits_i512_load_arg(ptr %w) {
+; NO512-LABEL: allbits_i512_load_arg:
+; NO512: # %bb.0:
+; NO512-NEXT: movq 16(%rdi), %rax
+; NO512-NEXT: movq (%rdi), %rcx
+; NO512-NEXT: movq 8(%rdi), %rdx
+; NO512-NEXT: movq 24(%rdi), %rsi
+; NO512-NEXT: andq 56(%rdi), %rsi
+; NO512-NEXT: andq 40(%rdi), %rdx
+; NO512-NEXT: andq %rsi, %rdx
+; NO512-NEXT: andq 48(%rdi), %rax
+; NO512-NEXT: andq 32(%rdi), %rcx
+; NO512-NEXT: andq %rax, %rcx
+; NO512-NEXT: andq %rdx, %rcx
+; NO512-NEXT: cmpq $-1, %rcx
+; NO512-NEXT: sete %al
+; NO512-NEXT: retq
+;
+; AVX512-LABEL: allbits_i512_load_arg:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 = -1
+; AVX512-NEXT: vpcmpneqd (%rdi), %zmm0, %k0
+; AVX512-NEXT: kortestw %k0, %k0
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %ld = load i512, ptr %w
+ %cmp = icmp eq i512 %ld, -1
+ ret i1 %cmp
+}
diff --git a/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll b/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
index 446a84d..ffdc80a 100644
--- a/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
+++ b/llvm/test/DebugInfo/COFF/AArch64/codeview-sve.ll
@@ -101,7 +101,7 @@
; CHECK-NEXT: LocalVariableAddrRange {
; CHECK-NEXT: OffsetStart: .text+0x0
; CHECK-NEXT: ISectStart: 0x0
-; CHECK-NEXT: Range: 0xBC
+; CHECK-NEXT: Range: 0xB8
; CHECK-NEXT: }
; CHECK-NEXT: }
; CHECK-NEXT: ProcEnd {
diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 48aec4b..57da338 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -917,11 +917,11 @@ main:
# CHECK: f16x8.nearest # encoding: [0xfd,0xb6,0x02]
f16x8.nearest
- # CHECK: f16x8.relaxed_madd # encoding: [0xfd,0xce,0x02]
- f16x8.relaxed_madd
+ # CHECK: f16x8.madd # encoding: [0xfd,0xce,0x02]
+ f16x8.madd
- # CHECK: f16x8.relaxed_nmadd # encoding: [0xfd,0xcf,0x02]
- f16x8.relaxed_nmadd
+ # CHECK: f16x8.nmadd # encoding: [0xfd,0xcf,0x02]
+ f16x8.nmadd
# CHECK: i16x8.trunc_sat_f16x8_s # encoding: [0xfd,0xc5,0x02]
i16x8.trunc_sat_f16x8_s
diff --git a/llvm/test/Other/debugcounter-dce.ll b/llvm/test/Other/debugcounter-dce.ll
index 54d929f..3b1dfb4 100644
--- a/llvm/test/Other/debugcounter-dce.ll
+++ b/llvm/test/Other/debugcounter-dce.ll
@@ -1,8 +1,16 @@
; REQUIRES: asserts
-; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2 < %s | FileCheck %s
+; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2 < %s | FileCheck %s --check-prefixes=CHECK,NO-PRINT
+; RUN: opt -passes=dce -S -debug-counter=dce-transform=1-2 -print-debug-counter-queries < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,PRINT
;; Test that, with debug counters on, we will skip the first DCE opportunity, perform next 2,
;; and ignore all the others left.
+; NO-PRINT-NOT: DebugCounter
+; PRINT: DebugCounter dce-transform=0 skip
+; PRINT-NEXT: DebugCounter dce-transform=1 execute
+; PRINT-NEXT: DebugCounter dce-transform=2 execute
+; PRINT-NEXT: DebugCounter dce-transform=3 skip
+; PRINT-NEXT: DebugCounter dce-transform=4 skip
+
; CHECK-LABEL: @test
; CHECK-NEXT: %add1 = add i32 1, 2
; CHECK-NEXT: %sub1 = sub i32 %add1, 1
diff --git a/llvm/test/TableGen/listsplat.td b/llvm/test/TableGen/listsplat.td
index 5a93a4c..43803d6 100644
--- a/llvm/test/TableGen/listsplat.td
+++ b/llvm/test/TableGen/listsplat.td
@@ -1,4 +1,5 @@
// RUN: llvm-tblgen %s | FileCheck %s
+// RUN: not llvm-tblgen -DERROR1 %s 2>&1 | FileCheck --check-prefix=ERROR1 %s
// CHECK: ------------- Classes -----------------
// CHECK-NEXT: class X<int X:a = ?, int X:b = ?> {
@@ -73,3 +74,8 @@ def DYa1 : Y<"a", 1>;
def DYa2 : Y<"a", 2>;
def DZ : X<42, !size([1, 2, 3])>;
+
+#ifdef ERROR1
+// ERROR1: !listsplat count -1 is negative
+defvar E = !listsplat("", -1);
+#endif
diff --git a/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
index d25d0f1..4c0f9db 100644
--- a/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
+++ b/llvm/test/Transforms/ExpandFp/AMDGPU/frem.ll
@@ -380,9 +380,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX1:%.*]] = fpext half [[AX]] to float
; CHECK-NEXT: [[AY2:%.*]] = fpext half [[AY]] to float
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX1]], [[AY2]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]]
; CHECK: [[BB4:.*]]:
-; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP38:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP58:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP57:%.*]], %[[FREM_ELSE20]] ]
; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq half [[TMP2]], 0xH0000
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], half 0xH7E00, half [[RET]]
; CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.fabs.f16(half [[TMP1]])
@@ -396,9 +396,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX16:%.*]] = fpext half [[AX14]] to float
; CHECK-NEXT: [[AY17:%.*]] = fpext half [[AY15]] to float
; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX16]], [[AY17]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
; CHECK: [[BB14:.*]]:
-; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP57:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP48:%.*]], %[[FREM_ELSE20]] ]
+; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP46:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP38:%.*]], %[[FREM_ELSE]] ]
; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq half [[TMP12]], 0xH0000
; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], half 0xH7E00, half [[RET18]]
; CHECK-NEXT: [[TMP17:%.*]] = call half @llvm.fabs.f16(half [[TMP11]])
@@ -408,12 +408,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: store <2 x half> [[R2]], ptr addrspace(1) [[OUT]], align 8
; CHECK-NEXT: ret void
; CHECK: [[FREM_COMPUTE]]:
-; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]])
+; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]])
; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0
; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP20]], 1
; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1
; CHECK-NEXT: [[AX3:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP21]], i32 11)
-; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]])
+; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]])
; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0
; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1
; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1
@@ -423,10 +423,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 11
; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]]
; CHECK: [[FREM_ELSE]]:
-; CHECK-NEXT: [[TMP27:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]])
-; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq float [[AX1]], [[AY2]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], half [[TMP27]], half [[TMP1]]
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP28:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]])
+; CHECK-NEXT: [[TMP29:%.*]] = fcmp oeq float [[AX16]], [[AY17]]
+; CHECK-NEXT: [[TMP38]] = select i1 [[TMP29]], half [[TMP28]], half [[TMP11]]
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_LOOP_BODY]]:
; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
@@ -456,15 +456,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX12:%.*]] = select i1 [[CLT10]], float [[AXP11]], float [[AX9]]
; CHECK-NEXT: [[AX13:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX12]], i32 [[EY]])
; CHECK-NEXT: [[TMP37:%.*]] = fptrunc float [[AX13]] to half
-; CHECK-NEXT: [[TMP38]] = call half @llvm.copysign.f16(half [[TMP37]], half [[TMP1]])
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP46]] = call half @llvm.copysign.f16(half [[TMP37]], half [[TMP11]])
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_COMPUTE19]]:
-; CHECK-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]])
+; CHECK-NEXT: [[TMP39:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]])
; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP39]], 0
; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP39]], 1
; CHECK-NEXT: [[EX21:%.*]] = sub i32 [[TMP41]], 1
; CHECK-NEXT: [[AX22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP40]], i32 11)
-; CHECK-NEXT: [[TMP42:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]])
+; CHECK-NEXT: [[TMP42:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]])
; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { float, i32 } [[TMP42]], 0
; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP42]], 1
; CHECK-NEXT: [[EY23:%.*]] = sub i32 [[TMP44]], 1
@@ -474,10 +474,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP45:%.*]] = icmp sgt i32 [[NB25]], 11
; CHECK-NEXT: br i1 [[TMP45]], label %[[FREM_LOOP_BODY27:.*]], label %[[FREM_LOOP_EXIT28]]
; CHECK: [[FREM_ELSE20]]:
-; CHECK-NEXT: [[TMP46:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]])
-; CHECK-NEXT: [[TMP47:%.*]] = fcmp oeq float [[AX16]], [[AY17]]
-; CHECK-NEXT: [[TMP48]] = select i1 [[TMP47]], half [[TMP46]], half [[TMP11]]
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP47:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]])
+; CHECK-NEXT: [[TMP48:%.*]] = fcmp oeq float [[AX1]], [[AY2]]
+; CHECK-NEXT: [[TMP57]] = select i1 [[TMP48]], half [[TMP47]], half [[TMP1]]
+; CHECK-NEXT: br label %[[BB4]]
; CHECK: [[FREM_LOOP_BODY27]]:
; CHECK-NEXT: [[NB_IV29:%.*]] = phi i32 [ [[NB25]], %[[FREM_COMPUTE19]] ], [ [[NB_UPDATE37:%.*]], %[[FREM_LOOP_BODY27]] ]
; CHECK-NEXT: [[AX_LOOP_PHI30:%.*]] = phi float [ [[AX22]], %[[FREM_COMPUTE19]] ], [ [[AX_UPDATE36:%.*]], %[[FREM_LOOP_BODY27]] ]
@@ -507,8 +507,8 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX45:%.*]] = select i1 [[CLT43]], float [[AXP44]], float [[AX42]]
; CHECK-NEXT: [[AX46:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX45]], i32 [[EY23]])
; CHECK-NEXT: [[TMP56:%.*]] = fptrunc float [[AX46]] to half
-; CHECK-NEXT: [[TMP57]] = call half @llvm.copysign.f16(half [[TMP56]], half [[TMP11]])
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP58]] = call half @llvm.copysign.f16(half [[TMP56]], half [[TMP1]])
+; CHECK-NEXT: br label %[[BB4]]
;
ptr addrspace(1) %in2) {
%gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
@@ -532,9 +532,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX1:%.*]] = fpext half [[AX]] to float
; CHECK-NEXT: [[AY2:%.*]] = fpext half [[AY]] to float
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX1]], [[AY2]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE85:.*]], label %[[FREM_ELSE86:.*]]
; CHECK: [[BB4:.*]]:
-; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP58:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP49:%.*]], %[[FREM_ELSE]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi half [ [[TMP116:%.*]], %[[FREM_LOOP_EXIT94:.*]] ], [ [[TMP115:%.*]], %[[FREM_ELSE86]] ]
; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq half [[TMP2]], 0xH0000
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], half 0xH7E00, half [[RET]]
; CHECK-NEXT: [[TMP7:%.*]] = call half @llvm.fabs.f16(half [[TMP1]])
@@ -548,9 +548,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX16:%.*]] = fpext half [[AX14]] to float
; CHECK-NEXT: [[AY17:%.*]] = fpext half [[AY15]] to float
; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX16]], [[AY17]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE52:.*]], label %[[FREM_ELSE53:.*]]
; CHECK: [[BB14:.*]]:
-; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP77:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP68:%.*]], %[[FREM_ELSE20]] ]
+; CHECK-NEXT: [[RET18:%.*]] = phi half [ [[TMP104:%.*]], %[[FREM_LOOP_EXIT61:.*]] ], [ [[TMP96:%.*]], %[[FREM_ELSE53]] ]
; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq half [[TMP12]], 0xH0000
; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], half 0xH7E00, half [[RET18]]
; CHECK-NEXT: [[TMP17:%.*]] = call half @llvm.fabs.f16(half [[TMP11]])
@@ -564,9 +564,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX49:%.*]] = fpext half [[AX47]] to float
; CHECK-NEXT: [[AY50:%.*]] = fpext half [[AY48]] to float
; CHECK-NEXT: [[TMP23:%.*]] = fcmp ogt float [[AX49]], [[AY50]]
-; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE52:.*]], label %[[FREM_ELSE53:.*]]
+; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE19:.*]], label %[[FREM_ELSE20:.*]]
; CHECK: [[BB24:.*]]:
-; CHECK-NEXT: [[RET51:%.*]] = phi half [ [[TMP96:%.*]], %[[FREM_LOOP_EXIT61:.*]] ], [ [[TMP87:%.*]], %[[FREM_ELSE53]] ]
+; CHECK-NEXT: [[RET51:%.*]] = phi half [ [[TMP85:%.*]], %[[FREM_LOOP_EXIT28:.*]] ], [ [[TMP77:%.*]], %[[FREM_ELSE20]] ]
; CHECK-NEXT: [[TMP25:%.*]] = fcmp ueq half [[TMP22]], 0xH0000
; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], half 0xH7E00, half [[RET51]]
; CHECK-NEXT: [[TMP27:%.*]] = call half @llvm.fabs.f16(half [[TMP21]])
@@ -580,9 +580,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX82:%.*]] = fpext half [[AX80]] to float
; CHECK-NEXT: [[AY83:%.*]] = fpext half [[AY81]] to float
; CHECK-NEXT: [[TMP33:%.*]] = fcmp ogt float [[AX82]], [[AY83]]
-; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE85:.*]], label %[[FREM_ELSE86:.*]]
+; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
; CHECK: [[BB34:.*]]:
-; CHECK-NEXT: [[RET84:%.*]] = phi half [ [[TMP115:%.*]], %[[FREM_LOOP_EXIT94:.*]] ], [ [[TMP106:%.*]], %[[FREM_ELSE86]] ]
+; CHECK-NEXT: [[RET84:%.*]] = phi half [ [[TMP66:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP58:%.*]], %[[FREM_ELSE]] ]
; CHECK-NEXT: [[TMP35:%.*]] = fcmp ueq half [[TMP32]], 0xH0000
; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], half 0xH7E00, half [[RET84]]
; CHECK-NEXT: [[TMP37:%.*]] = call half @llvm.fabs.f16(half [[TMP31]])
@@ -592,12 +592,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: store <4 x half> [[R2]], ptr addrspace(1) [[OUT]], align 16
; CHECK-NEXT: ret void
; CHECK: [[FREM_COMPUTE]]:
-; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]])
+; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX82]])
; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0
; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP40]], 1
; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP42]], 1
; CHECK-NEXT: [[AX3:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP41]], i32 11)
-; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]])
+; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY83]])
; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP43]], 0
; CHECK-NEXT: [[TMP45:%.*]] = extractvalue { float, i32 } [[TMP43]], 1
; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP45]], 1
@@ -607,10 +607,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[NB]], 11
; CHECK-NEXT: br i1 [[TMP46]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]]
; CHECK: [[FREM_ELSE]]:
-; CHECK-NEXT: [[TMP47:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]])
-; CHECK-NEXT: [[TMP48:%.*]] = fcmp oeq float [[AX1]], [[AY2]]
-; CHECK-NEXT: [[TMP49]] = select i1 [[TMP48]], half [[TMP47]], half [[TMP1]]
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP48:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP31]])
+; CHECK-NEXT: [[TMP49:%.*]] = fcmp oeq float [[AX82]], [[AY83]]
+; CHECK-NEXT: [[TMP58]] = select i1 [[TMP49]], half [[TMP48]], half [[TMP31]]
+; CHECK-NEXT: br label %[[BB34]]
; CHECK: [[FREM_LOOP_BODY]]:
; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX3]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
@@ -640,15 +640,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX12:%.*]] = select i1 [[CLT10]], float [[AXP11]], float [[AX9]]
; CHECK-NEXT: [[AX13:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX12]], i32 [[EY]])
; CHECK-NEXT: [[TMP57:%.*]] = fptrunc float [[AX13]] to half
-; CHECK-NEXT: [[TMP58]] = call half @llvm.copysign.f16(half [[TMP57]], half [[TMP1]])
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP66]] = call half @llvm.copysign.f16(half [[TMP57]], half [[TMP31]])
+; CHECK-NEXT: br label %[[BB34]]
; CHECK: [[FREM_COMPUTE19]]:
-; CHECK-NEXT: [[TMP59:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]])
+; CHECK-NEXT: [[TMP59:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX49]])
; CHECK-NEXT: [[TMP60:%.*]] = extractvalue { float, i32 } [[TMP59]], 0
; CHECK-NEXT: [[TMP61:%.*]] = extractvalue { float, i32 } [[TMP59]], 1
; CHECK-NEXT: [[EX21:%.*]] = sub i32 [[TMP61]], 1
; CHECK-NEXT: [[AX22:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP60]], i32 11)
-; CHECK-NEXT: [[TMP62:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]])
+; CHECK-NEXT: [[TMP62:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY50]])
; CHECK-NEXT: [[TMP63:%.*]] = extractvalue { float, i32 } [[TMP62]], 0
; CHECK-NEXT: [[TMP64:%.*]] = extractvalue { float, i32 } [[TMP62]], 1
; CHECK-NEXT: [[EY23:%.*]] = sub i32 [[TMP64]], 1
@@ -658,10 +658,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP65:%.*]] = icmp sgt i32 [[NB25]], 11
; CHECK-NEXT: br i1 [[TMP65]], label %[[FREM_LOOP_BODY27:.*]], label %[[FREM_LOOP_EXIT28]]
; CHECK: [[FREM_ELSE20]]:
-; CHECK-NEXT: [[TMP66:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]])
-; CHECK-NEXT: [[TMP67:%.*]] = fcmp oeq float [[AX16]], [[AY17]]
-; CHECK-NEXT: [[TMP68]] = select i1 [[TMP67]], half [[TMP66]], half [[TMP11]]
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP67:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP21]])
+; CHECK-NEXT: [[TMP68:%.*]] = fcmp oeq float [[AX49]], [[AY50]]
+; CHECK-NEXT: [[TMP77]] = select i1 [[TMP68]], half [[TMP67]], half [[TMP21]]
+; CHECK-NEXT: br label %[[BB24]]
; CHECK: [[FREM_LOOP_BODY27]]:
; CHECK-NEXT: [[NB_IV29:%.*]] = phi i32 [ [[NB25]], %[[FREM_COMPUTE19]] ], [ [[NB_UPDATE37:%.*]], %[[FREM_LOOP_BODY27]] ]
; CHECK-NEXT: [[AX_LOOP_PHI30:%.*]] = phi float [ [[AX22]], %[[FREM_COMPUTE19]] ], [ [[AX_UPDATE36:%.*]], %[[FREM_LOOP_BODY27]] ]
@@ -691,15 +691,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX45:%.*]] = select i1 [[CLT43]], float [[AXP44]], float [[AX42]]
; CHECK-NEXT: [[AX46:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX45]], i32 [[EY23]])
; CHECK-NEXT: [[TMP76:%.*]] = fptrunc float [[AX46]] to half
-; CHECK-NEXT: [[TMP77]] = call half @llvm.copysign.f16(half [[TMP76]], half [[TMP11]])
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP85]] = call half @llvm.copysign.f16(half [[TMP76]], half [[TMP21]])
+; CHECK-NEXT: br label %[[BB24]]
; CHECK: [[FREM_COMPUTE52]]:
-; CHECK-NEXT: [[TMP78:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX49]])
+; CHECK-NEXT: [[TMP78:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX16]])
; CHECK-NEXT: [[TMP79:%.*]] = extractvalue { float, i32 } [[TMP78]], 0
; CHECK-NEXT: [[TMP80:%.*]] = extractvalue { float, i32 } [[TMP78]], 1
; CHECK-NEXT: [[EX54:%.*]] = sub i32 [[TMP80]], 1
; CHECK-NEXT: [[AX55:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP79]], i32 11)
-; CHECK-NEXT: [[TMP81:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY50]])
+; CHECK-NEXT: [[TMP81:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY17]])
; CHECK-NEXT: [[TMP82:%.*]] = extractvalue { float, i32 } [[TMP81]], 0
; CHECK-NEXT: [[TMP83:%.*]] = extractvalue { float, i32 } [[TMP81]], 1
; CHECK-NEXT: [[EY56:%.*]] = sub i32 [[TMP83]], 1
@@ -709,10 +709,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP84:%.*]] = icmp sgt i32 [[NB58]], 11
; CHECK-NEXT: br i1 [[TMP84]], label %[[FREM_LOOP_BODY60:.*]], label %[[FREM_LOOP_EXIT61]]
; CHECK: [[FREM_ELSE53]]:
-; CHECK-NEXT: [[TMP85:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP21]])
-; CHECK-NEXT: [[TMP86:%.*]] = fcmp oeq float [[AX49]], [[AY50]]
-; CHECK-NEXT: [[TMP87]] = select i1 [[TMP86]], half [[TMP85]], half [[TMP21]]
-; CHECK-NEXT: br label %[[BB24]]
+; CHECK-NEXT: [[TMP86:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP11]])
+; CHECK-NEXT: [[TMP87:%.*]] = fcmp oeq float [[AX16]], [[AY17]]
+; CHECK-NEXT: [[TMP96]] = select i1 [[TMP87]], half [[TMP86]], half [[TMP11]]
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_LOOP_BODY60]]:
; CHECK-NEXT: [[NB_IV62:%.*]] = phi i32 [ [[NB58]], %[[FREM_COMPUTE52]] ], [ [[NB_UPDATE70:%.*]], %[[FREM_LOOP_BODY60]] ]
; CHECK-NEXT: [[AX_LOOP_PHI63:%.*]] = phi float [ [[AX55]], %[[FREM_COMPUTE52]] ], [ [[AX_UPDATE69:%.*]], %[[FREM_LOOP_BODY60]] ]
@@ -742,15 +742,15 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX78:%.*]] = select i1 [[CLT76]], float [[AXP77]], float [[AX75]]
; CHECK-NEXT: [[AX79:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX78]], i32 [[EY56]])
; CHECK-NEXT: [[TMP95:%.*]] = fptrunc float [[AX79]] to half
-; CHECK-NEXT: [[TMP96]] = call half @llvm.copysign.f16(half [[TMP95]], half [[TMP21]])
-; CHECK-NEXT: br label %[[BB24]]
+; CHECK-NEXT: [[TMP104]] = call half @llvm.copysign.f16(half [[TMP95]], half [[TMP11]])
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_COMPUTE85]]:
-; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX82]])
+; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX1]])
; CHECK-NEXT: [[TMP98:%.*]] = extractvalue { float, i32 } [[TMP97]], 0
; CHECK-NEXT: [[TMP99:%.*]] = extractvalue { float, i32 } [[TMP97]], 1
; CHECK-NEXT: [[EX87:%.*]] = sub i32 [[TMP99]], 1
; CHECK-NEXT: [[AX88:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP98]], i32 11)
-; CHECK-NEXT: [[TMP100:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY83]])
+; CHECK-NEXT: [[TMP100:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY2]])
; CHECK-NEXT: [[TMP101:%.*]] = extractvalue { float, i32 } [[TMP100]], 0
; CHECK-NEXT: [[TMP102:%.*]] = extractvalue { float, i32 } [[TMP100]], 1
; CHECK-NEXT: [[EY89:%.*]] = sub i32 [[TMP102]], 1
@@ -760,10 +760,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP103:%.*]] = icmp sgt i32 [[NB91]], 11
; CHECK-NEXT: br i1 [[TMP103]], label %[[FREM_LOOP_BODY93:.*]], label %[[FREM_LOOP_EXIT94]]
; CHECK: [[FREM_ELSE86]]:
-; CHECK-NEXT: [[TMP104:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP31]])
-; CHECK-NEXT: [[TMP105:%.*]] = fcmp oeq float [[AX82]], [[AY83]]
-; CHECK-NEXT: [[TMP106]] = select i1 [[TMP105]], half [[TMP104]], half [[TMP31]]
-; CHECK-NEXT: br label %[[BB34]]
+; CHECK-NEXT: [[TMP105:%.*]] = call half @llvm.copysign.f16(half 0xH0000, half [[TMP1]])
+; CHECK-NEXT: [[TMP106:%.*]] = fcmp oeq float [[AX1]], [[AY2]]
+; CHECK-NEXT: [[TMP115]] = select i1 [[TMP106]], half [[TMP105]], half [[TMP1]]
+; CHECK-NEXT: br label %[[BB4]]
; CHECK: [[FREM_LOOP_BODY93]]:
; CHECK-NEXT: [[NB_IV95:%.*]] = phi i32 [ [[NB91]], %[[FREM_COMPUTE85]] ], [ [[NB_UPDATE103:%.*]], %[[FREM_LOOP_BODY93]] ]
; CHECK-NEXT: [[AX_LOOP_PHI96:%.*]] = phi float [ [[AX88]], %[[FREM_COMPUTE85]] ], [ [[AX_UPDATE102:%.*]], %[[FREM_LOOP_BODY93]] ]
@@ -793,8 +793,8 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX111:%.*]] = select i1 [[CLT109]], float [[AXP110]], float [[AX108]]
; CHECK-NEXT: [[AX112:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX111]], i32 [[EY89]])
; CHECK-NEXT: [[TMP114:%.*]] = fptrunc float [[AX112]] to half
-; CHECK-NEXT: [[TMP115]] = call half @llvm.copysign.f16(half [[TMP114]], half [[TMP31]])
-; CHECK-NEXT: br label %[[BB34]]
+; CHECK-NEXT: [[TMP116]] = call half @llvm.copysign.f16(half [[TMP114]], half [[TMP1]])
+; CHECK-NEXT: br label %[[BB4]]
;
ptr addrspace(1) %in2) {
%gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
@@ -816,9 +816,9 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
; CHECK-NEXT: [[AY:%.*]] = call float @llvm.fabs.f32(float [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX]], [[AY]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
; CHECK: [[BB4:.*]]:
-; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP37:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP56:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP55:%.*]], %[[FREM_ELSE16]] ]
; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq float [[TMP2]], 0.000000e+00
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float 0x7FF8000000000000, float [[RET]]
; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
@@ -830,9 +830,9 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX12:%.*]] = call float @llvm.fabs.f32(float [[TMP11]])
; CHECK-NEXT: [[AY13:%.*]] = call float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX12]], [[AY13]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
; CHECK: [[BB14:.*]]:
-; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP55:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP47:%.*]], %[[FREM_ELSE16]] ]
+; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP45:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP37:%.*]], %[[FREM_ELSE]] ]
; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP12]], 0.000000e+00
; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float 0x7FF8000000000000, float [[RET14]]
; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fabs.f32(float [[TMP11]])
@@ -842,12 +842,12 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: store <2 x float> [[R2]], ptr addrspace(1) [[OUT]], align 8
; CHECK-NEXT: ret void
; CHECK: [[FREM_COMPUTE]]:
-; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]])
+; CHECK-NEXT: [[TMP20:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]])
; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { float, i32 } [[TMP20]], 0
; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { float, i32 } [[TMP20]], 1
; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1
; CHECK-NEXT: [[AX1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP21]], i32 12)
-; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]])
+; CHECK-NEXT: [[TMP23:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]])
; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { float, i32 } [[TMP23]], 0
; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { float, i32 } [[TMP23]], 1
; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1
@@ -857,10 +857,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 12
; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]]
; CHECK: [[FREM_ELSE]]:
-; CHECK-NEXT: [[TMP27:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
-; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq float [[AX]], [[AY]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], float [[TMP27]], float [[TMP1]]
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]])
+; CHECK-NEXT: [[TMP29:%.*]] = fcmp oeq float [[AX12]], [[AY13]]
+; CHECK-NEXT: [[TMP37]] = select i1 [[TMP29]], float [[TMP28]], float [[TMP11]]
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_LOOP_BODY]]:
; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
@@ -889,15 +889,15 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP9:%.*]] = fadd float [[AX7]], [[AY2]]
; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], float [[AXP9]], float [[AX7]]
; CHECK-NEXT: [[AX11:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX10]], i32 [[EY]])
-; CHECK-NEXT: [[TMP37]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP1]])
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP45]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP11]])
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_COMPUTE15]]:
-; CHECK-NEXT: [[TMP38:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]])
+; CHECK-NEXT: [[TMP38:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]])
; CHECK-NEXT: [[TMP39:%.*]] = extractvalue { float, i32 } [[TMP38]], 0
; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { float, i32 } [[TMP38]], 1
; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP40]], 1
; CHECK-NEXT: [[AX18:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP39]], i32 12)
-; CHECK-NEXT: [[TMP41:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]])
+; CHECK-NEXT: [[TMP41:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]])
; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP41]], 0
; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { float, i32 } [[TMP41]], 1
; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP43]], 1
@@ -907,10 +907,10 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[NB21]], 12
; CHECK-NEXT: br i1 [[TMP44]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]]
; CHECK: [[FREM_ELSE16]]:
-; CHECK-NEXT: [[TMP45:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]])
-; CHECK-NEXT: [[TMP46:%.*]] = fcmp oeq float [[AX12]], [[AY13]]
-; CHECK-NEXT: [[TMP47]] = select i1 [[TMP46]], float [[TMP45]], float [[TMP11]]
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP46:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
+; CHECK-NEXT: [[TMP47:%.*]] = fcmp oeq float [[AX]], [[AY]]
+; CHECK-NEXT: [[TMP55]] = select i1 [[TMP47]], float [[TMP46]], float [[TMP1]]
+; CHECK-NEXT: br label %[[BB4]]
; CHECK: [[FREM_LOOP_BODY23]]:
; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ]
; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi float [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], %[[FREM_LOOP_BODY23]] ]
@@ -939,8 +939,8 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP40:%.*]] = fadd float [[AX38]], [[AY20]]
; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], float [[AXP40]], float [[AX38]]
; CHECK-NEXT: [[AX42:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX41]], i32 [[EY19]])
-; CHECK-NEXT: [[TMP55]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP11]])
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP56]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP1]])
+; CHECK-NEXT: br label %[[BB4]]
;
ptr addrspace(1) %in2) {
%gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
@@ -962,9 +962,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
; CHECK-NEXT: [[AY:%.*]] = call float @llvm.fabs.f32(float [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt float [[AX]], [[AY]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE77:.*]], label %[[FREM_ELSE78:.*]]
; CHECK: [[BB4:.*]]:
-; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP57:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP49:%.*]], %[[FREM_ELSE]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi float [ [[TMP112:%.*]], %[[FREM_LOOP_EXIT86:.*]] ], [ [[TMP111:%.*]], %[[FREM_ELSE78]] ]
; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq float [[TMP2]], 0.000000e+00
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], float 0x7FF8000000000000, float [[RET]]
; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.fabs.f32(float [[TMP1]])
@@ -976,9 +976,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX12:%.*]] = call float @llvm.fabs.f32(float [[TMP11]])
; CHECK-NEXT: [[AY13:%.*]] = call float @llvm.fabs.f32(float [[TMP12]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt float [[AX12]], [[AY13]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE46:.*]], label %[[FREM_ELSE47:.*]]
; CHECK: [[BB14:.*]]:
-; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP75:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP67:%.*]], %[[FREM_ELSE16]] ]
+; CHECK-NEXT: [[RET14:%.*]] = phi float [ [[TMP101:%.*]], %[[FREM_LOOP_EXIT55:.*]] ], [ [[TMP93:%.*]], %[[FREM_ELSE47]] ]
; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq float [[TMP12]], 0.000000e+00
; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], float 0x7FF8000000000000, float [[RET14]]
; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fabs.f32(float [[TMP11]])
@@ -990,9 +990,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX43:%.*]] = call float @llvm.fabs.f32(float [[TMP21]])
; CHECK-NEXT: [[AY44:%.*]] = call float @llvm.fabs.f32(float [[TMP22]])
; CHECK-NEXT: [[TMP23:%.*]] = fcmp ogt float [[AX43]], [[AY44]]
-; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE46:.*]], label %[[FREM_ELSE47:.*]]
+; CHECK-NEXT: br i1 [[TMP23]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
; CHECK: [[BB24:.*]]:
-; CHECK-NEXT: [[RET45:%.*]] = phi float [ [[TMP93:%.*]], %[[FREM_LOOP_EXIT55:.*]] ], [ [[TMP85:%.*]], %[[FREM_ELSE47]] ]
+; CHECK-NEXT: [[RET45:%.*]] = phi float [ [[TMP83:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP75:%.*]], %[[FREM_ELSE16]] ]
; CHECK-NEXT: [[TMP25:%.*]] = fcmp ueq float [[TMP22]], 0.000000e+00
; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP25]], float 0x7FF8000000000000, float [[RET45]]
; CHECK-NEXT: [[TMP27:%.*]] = call float @llvm.fabs.f32(float [[TMP21]])
@@ -1004,9 +1004,9 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX74:%.*]] = call float @llvm.fabs.f32(float [[TMP31]])
; CHECK-NEXT: [[AY75:%.*]] = call float @llvm.fabs.f32(float [[TMP32]])
; CHECK-NEXT: [[TMP33:%.*]] = fcmp ogt float [[AX74]], [[AY75]]
-; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE77:.*]], label %[[FREM_ELSE78:.*]]
+; CHECK-NEXT: br i1 [[TMP33]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
; CHECK: [[BB34:.*]]:
-; CHECK-NEXT: [[RET76:%.*]] = phi float [ [[TMP111:%.*]], %[[FREM_LOOP_EXIT86:.*]] ], [ [[TMP103:%.*]], %[[FREM_ELSE78]] ]
+; CHECK-NEXT: [[RET76:%.*]] = phi float [ [[TMP65:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP57:%.*]], %[[FREM_ELSE]] ]
; CHECK-NEXT: [[TMP35:%.*]] = fcmp ueq float [[TMP32]], 0.000000e+00
; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], float 0x7FF8000000000000, float [[RET76]]
; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.fabs.f32(float [[TMP31]])
@@ -1016,12 +1016,12 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: store <4 x float> [[R2]], ptr addrspace(1) [[OUT]], align 16
; CHECK-NEXT: ret void
; CHECK: [[FREM_COMPUTE]]:
-; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]])
+; CHECK-NEXT: [[TMP40:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX74]])
; CHECK-NEXT: [[TMP41:%.*]] = extractvalue { float, i32 } [[TMP40]], 0
; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { float, i32 } [[TMP40]], 1
; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP42]], 1
; CHECK-NEXT: [[AX1:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP41]], i32 12)
-; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]])
+; CHECK-NEXT: [[TMP43:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY75]])
; CHECK-NEXT: [[TMP44:%.*]] = extractvalue { float, i32 } [[TMP43]], 0
; CHECK-NEXT: [[TMP45:%.*]] = extractvalue { float, i32 } [[TMP43]], 1
; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP45]], 1
@@ -1031,10 +1031,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP46:%.*]] = icmp sgt i32 [[NB]], 12
; CHECK-NEXT: br i1 [[TMP46]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]]
; CHECK: [[FREM_ELSE]]:
-; CHECK-NEXT: [[TMP47:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
-; CHECK-NEXT: [[TMP48:%.*]] = fcmp oeq float [[AX]], [[AY]]
-; CHECK-NEXT: [[TMP49]] = select i1 [[TMP48]], float [[TMP47]], float [[TMP1]]
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP48:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP31]])
+; CHECK-NEXT: [[TMP49:%.*]] = fcmp oeq float [[AX74]], [[AY75]]
+; CHECK-NEXT: [[TMP57]] = select i1 [[TMP49]], float [[TMP48]], float [[TMP31]]
+; CHECK-NEXT: br label %[[BB34]]
; CHECK: [[FREM_LOOP_BODY]]:
; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi float [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
@@ -1063,15 +1063,15 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP9:%.*]] = fadd float [[AX7]], [[AY2]]
; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], float [[AXP9]], float [[AX7]]
; CHECK-NEXT: [[AX11:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX10]], i32 [[EY]])
-; CHECK-NEXT: [[TMP57]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP1]])
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP65]] = call float @llvm.copysign.f32(float [[AX11]], float [[TMP31]])
+; CHECK-NEXT: br label %[[BB34]]
; CHECK: [[FREM_COMPUTE15]]:
-; CHECK-NEXT: [[TMP58:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]])
+; CHECK-NEXT: [[TMP58:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX43]])
; CHECK-NEXT: [[TMP59:%.*]] = extractvalue { float, i32 } [[TMP58]], 0
; CHECK-NEXT: [[TMP60:%.*]] = extractvalue { float, i32 } [[TMP58]], 1
; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP60]], 1
; CHECK-NEXT: [[AX18:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP59]], i32 12)
-; CHECK-NEXT: [[TMP61:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]])
+; CHECK-NEXT: [[TMP61:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY44]])
; CHECK-NEXT: [[TMP62:%.*]] = extractvalue { float, i32 } [[TMP61]], 0
; CHECK-NEXT: [[TMP63:%.*]] = extractvalue { float, i32 } [[TMP61]], 1
; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP63]], 1
@@ -1081,10 +1081,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP64:%.*]] = icmp sgt i32 [[NB21]], 12
; CHECK-NEXT: br i1 [[TMP64]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]]
; CHECK: [[FREM_ELSE16]]:
-; CHECK-NEXT: [[TMP65:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]])
-; CHECK-NEXT: [[TMP66:%.*]] = fcmp oeq float [[AX12]], [[AY13]]
-; CHECK-NEXT: [[TMP67]] = select i1 [[TMP66]], float [[TMP65]], float [[TMP11]]
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP66:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP21]])
+; CHECK-NEXT: [[TMP67:%.*]] = fcmp oeq float [[AX43]], [[AY44]]
+; CHECK-NEXT: [[TMP75]] = select i1 [[TMP67]], float [[TMP66]], float [[TMP21]]
+; CHECK-NEXT: br label %[[BB24]]
; CHECK: [[FREM_LOOP_BODY23]]:
; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ]
; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi float [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], %[[FREM_LOOP_BODY23]] ]
@@ -1113,15 +1113,15 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP40:%.*]] = fadd float [[AX38]], [[AY20]]
; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], float [[AXP40]], float [[AX38]]
; CHECK-NEXT: [[AX42:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX41]], i32 [[EY19]])
-; CHECK-NEXT: [[TMP75]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP11]])
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP83]] = call float @llvm.copysign.f32(float [[AX42]], float [[TMP21]])
+; CHECK-NEXT: br label %[[BB24]]
; CHECK: [[FREM_COMPUTE46]]:
-; CHECK-NEXT: [[TMP76:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX43]])
+; CHECK-NEXT: [[TMP76:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX12]])
; CHECK-NEXT: [[TMP77:%.*]] = extractvalue { float, i32 } [[TMP76]], 0
; CHECK-NEXT: [[TMP78:%.*]] = extractvalue { float, i32 } [[TMP76]], 1
; CHECK-NEXT: [[EX48:%.*]] = sub i32 [[TMP78]], 1
; CHECK-NEXT: [[AX49:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP77]], i32 12)
-; CHECK-NEXT: [[TMP79:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY44]])
+; CHECK-NEXT: [[TMP79:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY13]])
; CHECK-NEXT: [[TMP80:%.*]] = extractvalue { float, i32 } [[TMP79]], 0
; CHECK-NEXT: [[TMP81:%.*]] = extractvalue { float, i32 } [[TMP79]], 1
; CHECK-NEXT: [[EY50:%.*]] = sub i32 [[TMP81]], 1
@@ -1131,10 +1131,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP82:%.*]] = icmp sgt i32 [[NB52]], 12
; CHECK-NEXT: br i1 [[TMP82]], label %[[FREM_LOOP_BODY54:.*]], label %[[FREM_LOOP_EXIT55]]
; CHECK: [[FREM_ELSE47]]:
-; CHECK-NEXT: [[TMP83:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP21]])
-; CHECK-NEXT: [[TMP84:%.*]] = fcmp oeq float [[AX43]], [[AY44]]
-; CHECK-NEXT: [[TMP85]] = select i1 [[TMP84]], float [[TMP83]], float [[TMP21]]
-; CHECK-NEXT: br label %[[BB24]]
+; CHECK-NEXT: [[TMP84:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP11]])
+; CHECK-NEXT: [[TMP85:%.*]] = fcmp oeq float [[AX12]], [[AY13]]
+; CHECK-NEXT: [[TMP93]] = select i1 [[TMP85]], float [[TMP84]], float [[TMP11]]
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_LOOP_BODY54]]:
; CHECK-NEXT: [[NB_IV56:%.*]] = phi i32 [ [[NB52]], %[[FREM_COMPUTE46]] ], [ [[NB_UPDATE64:%.*]], %[[FREM_LOOP_BODY54]] ]
; CHECK-NEXT: [[AX_LOOP_PHI57:%.*]] = phi float [ [[AX49]], %[[FREM_COMPUTE46]] ], [ [[AX_UPDATE63:%.*]], %[[FREM_LOOP_BODY54]] ]
@@ -1163,15 +1163,15 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP71:%.*]] = fadd float [[AX69]], [[AY51]]
; CHECK-NEXT: [[AX72:%.*]] = select i1 [[CLT70]], float [[AXP71]], float [[AX69]]
; CHECK-NEXT: [[AX73:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX72]], i32 [[EY50]])
-; CHECK-NEXT: [[TMP93]] = call float @llvm.copysign.f32(float [[AX73]], float [[TMP21]])
-; CHECK-NEXT: br label %[[BB24]]
+; CHECK-NEXT: [[TMP101]] = call float @llvm.copysign.f32(float [[AX73]], float [[TMP11]])
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_COMPUTE77]]:
-; CHECK-NEXT: [[TMP94:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX74]])
+; CHECK-NEXT: [[TMP94:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AX]])
; CHECK-NEXT: [[TMP95:%.*]] = extractvalue { float, i32 } [[TMP94]], 0
; CHECK-NEXT: [[TMP96:%.*]] = extractvalue { float, i32 } [[TMP94]], 1
; CHECK-NEXT: [[EX79:%.*]] = sub i32 [[TMP96]], 1
; CHECK-NEXT: [[AX80:%.*]] = call float @llvm.ldexp.f32.i32(float [[TMP95]], i32 12)
-; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY75]])
+; CHECK-NEXT: [[TMP97:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[AY]])
; CHECK-NEXT: [[TMP98:%.*]] = extractvalue { float, i32 } [[TMP97]], 0
; CHECK-NEXT: [[TMP99:%.*]] = extractvalue { float, i32 } [[TMP97]], 1
; CHECK-NEXT: [[EY81:%.*]] = sub i32 [[TMP99]], 1
@@ -1181,10 +1181,10 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP100:%.*]] = icmp sgt i32 [[NB83]], 12
; CHECK-NEXT: br i1 [[TMP100]], label %[[FREM_LOOP_BODY85:.*]], label %[[FREM_LOOP_EXIT86]]
; CHECK: [[FREM_ELSE78]]:
-; CHECK-NEXT: [[TMP101:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP31]])
-; CHECK-NEXT: [[TMP102:%.*]] = fcmp oeq float [[AX74]], [[AY75]]
-; CHECK-NEXT: [[TMP103]] = select i1 [[TMP102]], float [[TMP101]], float [[TMP31]]
-; CHECK-NEXT: br label %[[BB34]]
+; CHECK-NEXT: [[TMP102:%.*]] = call float @llvm.copysign.f32(float 0.000000e+00, float [[TMP1]])
+; CHECK-NEXT: [[TMP103:%.*]] = fcmp oeq float [[AX]], [[AY]]
+; CHECK-NEXT: [[TMP111]] = select i1 [[TMP103]], float [[TMP102]], float [[TMP1]]
+; CHECK-NEXT: br label %[[BB4]]
; CHECK: [[FREM_LOOP_BODY85]]:
; CHECK-NEXT: [[NB_IV87:%.*]] = phi i32 [ [[NB83]], %[[FREM_COMPUTE77]] ], [ [[NB_UPDATE95:%.*]], %[[FREM_LOOP_BODY85]] ]
; CHECK-NEXT: [[AX_LOOP_PHI88:%.*]] = phi float [ [[AX80]], %[[FREM_COMPUTE77]] ], [ [[AX_UPDATE94:%.*]], %[[FREM_LOOP_BODY85]] ]
@@ -1213,8 +1213,8 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP102:%.*]] = fadd float [[AX100]], [[AY82]]
; CHECK-NEXT: [[AX103:%.*]] = select i1 [[CLT101]], float [[AXP102]], float [[AX100]]
; CHECK-NEXT: [[AX104:%.*]] = call float @llvm.ldexp.f32.i32(float [[AX103]], i32 [[EY81]])
-; CHECK-NEXT: [[TMP111]] = call float @llvm.copysign.f32(float [[AX104]], float [[TMP31]])
-; CHECK-NEXT: br label %[[BB34]]
+; CHECK-NEXT: [[TMP112]] = call float @llvm.copysign.f32(float [[AX104]], float [[TMP1]])
+; CHECK-NEXT: br label %[[BB4]]
;
ptr addrspace(1) %in2) {
%gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
@@ -1236,9 +1236,9 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX:%.*]] = call double @llvm.fabs.f64(double [[TMP1]])
; CHECK-NEXT: [[AY:%.*]] = call double @llvm.fabs.f64(double [[TMP2]])
; CHECK-NEXT: [[TMP3:%.*]] = fcmp ogt double [[AX]], [[AY]]
-; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
+; CHECK-NEXT: br i1 [[TMP3]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
; CHECK: [[BB4:.*]]:
-; CHECK-NEXT: [[RET:%.*]] = phi double [ [[TMP37:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP29:%.*]], %[[FREM_ELSE]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi double [ [[TMP56:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP55:%.*]], %[[FREM_ELSE16]] ]
; CHECK-NEXT: [[TMP5:%.*]] = fcmp ueq double [[TMP2]], 0.000000e+00
; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], double 0x7FF8000000000000, double [[RET]]
; CHECK-NEXT: [[TMP7:%.*]] = call double @llvm.fabs.f64(double [[TMP1]])
@@ -1250,9 +1250,9 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AX12:%.*]] = call double @llvm.fabs.f64(double [[TMP11]])
; CHECK-NEXT: [[AY13:%.*]] = call double @llvm.fabs.f64(double [[TMP12]])
; CHECK-NEXT: [[TMP13:%.*]] = fcmp ogt double [[AX12]], [[AY13]]
-; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE15:.*]], label %[[FREM_ELSE16:.*]]
+; CHECK-NEXT: br i1 [[TMP13]], label %[[FREM_COMPUTE:.*]], label %[[FREM_ELSE:.*]]
; CHECK: [[BB14:.*]]:
-; CHECK-NEXT: [[RET14:%.*]] = phi double [ [[TMP55:%.*]], %[[FREM_LOOP_EXIT24:.*]] ], [ [[TMP47:%.*]], %[[FREM_ELSE16]] ]
+; CHECK-NEXT: [[RET14:%.*]] = phi double [ [[TMP45:%.*]], %[[FREM_LOOP_EXIT:.*]] ], [ [[TMP37:%.*]], %[[FREM_ELSE]] ]
; CHECK-NEXT: [[TMP15:%.*]] = fcmp ueq double [[TMP12]], 0.000000e+00
; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], double 0x7FF8000000000000, double [[RET14]]
; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fabs.f64(double [[TMP11]])
@@ -1262,12 +1262,12 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: store <2 x double> [[R2]], ptr addrspace(1) [[OUT]], align 16
; CHECK-NEXT: ret void
; CHECK: [[FREM_COMPUTE]]:
-; CHECK-NEXT: [[TMP20:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX]])
+; CHECK-NEXT: [[TMP20:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX12]])
; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { double, i32 } [[TMP20]], 0
; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { double, i32 } [[TMP20]], 1
; CHECK-NEXT: [[EX:%.*]] = sub i32 [[TMP22]], 1
; CHECK-NEXT: [[AX1:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP21]], i32 26)
-; CHECK-NEXT: [[TMP23:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY]])
+; CHECK-NEXT: [[TMP23:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY13]])
; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { double, i32 } [[TMP23]], 0
; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { double, i32 } [[TMP23]], 1
; CHECK-NEXT: [[EY:%.*]] = sub i32 [[TMP25]], 1
@@ -1277,10 +1277,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[NB]], 26
; CHECK-NEXT: br i1 [[TMP26]], label %[[FREM_LOOP_BODY:.*]], label %[[FREM_LOOP_EXIT]]
; CHECK: [[FREM_ELSE]]:
-; CHECK-NEXT: [[TMP27:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP1]])
-; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq double [[AX]], [[AY]]
-; CHECK-NEXT: [[TMP29]] = select i1 [[TMP28]], double [[TMP27]], double [[TMP1]]
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP11]])
+; CHECK-NEXT: [[TMP29:%.*]] = fcmp oeq double [[AX12]], [[AY13]]
+; CHECK-NEXT: [[TMP37]] = select i1 [[TMP29]], double [[TMP28]], double [[TMP11]]
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_LOOP_BODY]]:
; CHECK-NEXT: [[NB_IV:%.*]] = phi i32 [ [[NB]], %[[FREM_COMPUTE]] ], [ [[NB_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
; CHECK-NEXT: [[AX_LOOP_PHI:%.*]] = phi double [ [[AX1]], %[[FREM_COMPUTE]] ], [ [[AX_UPDATE:%.*]], %[[FREM_LOOP_BODY]] ]
@@ -1309,15 +1309,15 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP9:%.*]] = fadd double [[AX7]], [[AY2]]
; CHECK-NEXT: [[AX10:%.*]] = select i1 [[CLT8]], double [[AXP9]], double [[AX7]]
; CHECK-NEXT: [[AX11:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX10]], i32 [[EY]])
-; CHECK-NEXT: [[TMP37]] = call double @llvm.copysign.f64(double [[AX11]], double [[TMP1]])
-; CHECK-NEXT: br label %[[BB4]]
+; CHECK-NEXT: [[TMP45]] = call double @llvm.copysign.f64(double [[AX11]], double [[TMP11]])
+; CHECK-NEXT: br label %[[BB14]]
; CHECK: [[FREM_COMPUTE15]]:
-; CHECK-NEXT: [[TMP38:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX12]])
+; CHECK-NEXT: [[TMP38:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AX]])
; CHECK-NEXT: [[TMP39:%.*]] = extractvalue { double, i32 } [[TMP38]], 0
; CHECK-NEXT: [[TMP40:%.*]] = extractvalue { double, i32 } [[TMP38]], 1
; CHECK-NEXT: [[EX17:%.*]] = sub i32 [[TMP40]], 1
; CHECK-NEXT: [[AX18:%.*]] = call double @llvm.ldexp.f64.i32(double [[TMP39]], i32 26)
-; CHECK-NEXT: [[TMP41:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY13]])
+; CHECK-NEXT: [[TMP41:%.*]] = call { double, i32 } @llvm.frexp.f64.i32(double [[AY]])
; CHECK-NEXT: [[TMP42:%.*]] = extractvalue { double, i32 } [[TMP41]], 0
; CHECK-NEXT: [[TMP43:%.*]] = extractvalue { double, i32 } [[TMP41]], 1
; CHECK-NEXT: [[EY19:%.*]] = sub i32 [[TMP43]], 1
@@ -1327,10 +1327,10 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[TMP44:%.*]] = icmp sgt i32 [[NB21]], 26
; CHECK-NEXT: br i1 [[TMP44]], label %[[FREM_LOOP_BODY23:.*]], label %[[FREM_LOOP_EXIT24]]
; CHECK: [[FREM_ELSE16]]:
-; CHECK-NEXT: [[TMP45:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP11]])
-; CHECK-NEXT: [[TMP46:%.*]] = fcmp oeq double [[AX12]], [[AY13]]
-; CHECK-NEXT: [[TMP47]] = select i1 [[TMP46]], double [[TMP45]], double [[TMP11]]
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP46:%.*]] = call double @llvm.copysign.f64(double 0.000000e+00, double [[TMP1]])
+; CHECK-NEXT: [[TMP47:%.*]] = fcmp oeq double [[AX]], [[AY]]
+; CHECK-NEXT: [[TMP55]] = select i1 [[TMP47]], double [[TMP46]], double [[TMP1]]
+; CHECK-NEXT: br label %[[BB4]]
; CHECK: [[FREM_LOOP_BODY23]]:
; CHECK-NEXT: [[NB_IV25:%.*]] = phi i32 [ [[NB21]], %[[FREM_COMPUTE15]] ], [ [[NB_UPDATE33:%.*]], %[[FREM_LOOP_BODY23]] ]
; CHECK-NEXT: [[AX_LOOP_PHI26:%.*]] = phi double [ [[AX18]], %[[FREM_COMPUTE15]] ], [ [[AX_UPDATE32:%.*]], %[[FREM_LOOP_BODY23]] ]
@@ -1359,8 +1359,8 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; CHECK-NEXT: [[AXP40:%.*]] = fadd double [[AX38]], [[AY20]]
; CHECK-NEXT: [[AX41:%.*]] = select i1 [[CLT39]], double [[AXP40]], double [[AX38]]
; CHECK-NEXT: [[AX42:%.*]] = call double @llvm.ldexp.f64.i32(double [[AX41]], i32 [[EY19]])
-; CHECK-NEXT: [[TMP55]] = call double @llvm.copysign.f64(double [[AX42]], double [[TMP11]])
-; CHECK-NEXT: br label %[[BB14]]
+; CHECK-NEXT: [[TMP56]] = call double @llvm.copysign.f64(double [[AX42]], double [[TMP1]])
+; CHECK-NEXT: br label %[[BB4]]
;
ptr addrspace(1) %in2) {
%gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
diff --git a/llvm/test/Transforms/InstCombine/add-sitofp.ll b/llvm/test/Transforms/InstCombine/add-sitofp.ll
index fae1365..e1d39fd 100644
--- a/llvm/test/Transforms/InstCombine/add-sitofp.ll
+++ b/llvm/test/Transforms/InstCombine/add-sitofp.ll
@@ -99,12 +99,15 @@ define float @test_3(i32 %a, i32 %b) {
ret float %p
}
+; Don't perform the fold on vector operations, as the integer op may be
+; much more expensive than the float op in that case.
define <4 x double> @test_4(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: @test_4(
; CHECK-NEXT: [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], splat (i32 1073741823)
; CHECK-NEXT: [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], splat (i32 1073741823)
-; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i32> [[A_AND]], [[B_AND]]
-; CHECK-NEXT: [[RES:%.*]] = uitofp nneg <4 x i32> [[TMP1]] to <4 x double>
+; CHECK-NEXT: [[A_AND_FP:%.*]] = uitofp nneg <4 x i32> [[A_AND]] to <4 x double>
+; CHECK-NEXT: [[B_AND_FP:%.*]] = uitofp nneg <4 x i32> [[B_AND]] to <4 x double>
+; CHECK-NEXT: [[RES:%.*]] = fadd <4 x double> [[A_AND_FP]], [[B_AND_FP]]
; CHECK-NEXT: ret <4 x double> [[RES]]
;
; Drop two highest bits to guarantee that %a + %b doesn't overflow
diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll
index 702bbbb..57184ea 100644
--- a/llvm/test/Transforms/InstCombine/binop-itofp.ll
+++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll
@@ -1063,6 +1063,25 @@ define float @negzero_check_on_constant_for_si_fmul(i1 %c, i1 %.b, ptr %g_2345)
ret float %mul3.i.i
}
+; Don't perform the fold on vector operations, as the integer op may be
+; much more expensive than the float op in that case.
+define <2 x half> @test_ui_ui_i8_mul_vec(<2 x i8> noundef %x_in, <2 x i8> noundef %y_in) {
+; CHECK-LABEL: @test_ui_ui_i8_mul_vec(
+; CHECK-NEXT: [[X:%.*]] = and <2 x i8> [[X_IN:%.*]], splat (i8 15)
+; CHECK-NEXT: [[Y:%.*]] = and <2 x i8> [[Y_IN:%.*]], splat (i8 15)
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg <2 x i8> [[X]] to <2 x half>
+; CHECK-NEXT: [[YF:%.*]] = uitofp nneg <2 x i8> [[Y]] to <2 x half>
+; CHECK-NEXT: [[R:%.*]] = fmul <2 x half> [[XF]], [[YF]]
+; CHECK-NEXT: ret <2 x half> [[R]]
+;
+ %x = and <2 x i8> %x_in, splat (i8 15)
+ %y = and <2 x i8> %y_in, splat (i8 15)
+ %xf = uitofp <2 x i8> %x to <2 x half>
+ %yf = uitofp <2 x i8> %y to <2 x half>
+ %r = fmul <2 x half> %xf, %yf
+ ret <2 x half> %r
+}
+
define <2 x float> @nonzero_check_on_constant_for_si_fmul_vec_w_poison(i1 %c, i1 %.b, ptr %g_2345) {
; CHECK-LABEL: @nonzero_check_on_constant_for_si_fmul_vec_w_poison(
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i32 65529, i32 53264
@@ -1091,8 +1110,9 @@ define <2 x float> @nonzero_check_on_constant_for_si_fmul_nz_vec_w_poison(i1 %c,
; CHECK-NEXT: [[CONV_I_V:%.*]] = insertelement <2 x i16> poison, i16 [[CONV_I_S]], i64 0
; CHECK-NEXT: [[CONV_I:%.*]] = shufflevector <2 x i16> [[CONV_I_V]], <2 x i16> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[MUL3_I_I:%.*]] = sitofp <2 x i16> [[CONV_I]] to <2 x float>
+; CHECK-NEXT: [[MUL3_I_I1:%.*]] = fmul <2 x float> [[MUL3_I_I]], <float poison, float 1.000000e+00>
; CHECK-NEXT: store i32 [[SEL]], ptr [[G_2345:%.*]], align 4
-; CHECK-NEXT: ret <2 x float> [[MUL3_I_I]]
+; CHECK-NEXT: ret <2 x float> [[MUL3_I_I1]]
;
%sel = select i1 %c, i32 65529, i32 53264
%conv.i.s = trunc i32 %sel to i16
diff --git a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
index 7b0b152..ffaa8b1 100644
--- a/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
+++ b/llvm/test/Transforms/InstCombine/ptrtoaddr.ll
@@ -23,10 +23,7 @@ define i64 @ptrtoaddr_inttoptr_arg(i64 %a) {
define i32 @ptrtoaddr_inttoptr_arg_addrsize(i32 %a) {
; CHECK-LABEL: define i32 @ptrtoaddr_inttoptr_arg_addrsize(
; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[A]] to i64
-; CHECK-NEXT: [[TOPTR:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(1)
-; CHECK-NEXT: [[TOADDR:%.*]] = ptrtoaddr ptr addrspace(1) [[TOPTR]] to i32
-; CHECK-NEXT: ret i32 [[TOADDR]]
+; CHECK-NEXT: ret i32 [[A]]
;
%toptr = inttoptr i32 %a to ptr addrspace(1)
%toaddr = ptrtoaddr ptr addrspace(1) %toptr to i32
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
index 9ed2240..9357adf 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
@@ -273,3 +273,106 @@ loop:
exit:
ret void
}
+
+define void @ld_div2_ld_scevunknown_nonuniform(ptr %src.a, ptr noalias %src.b, ptr noalias %dst) {
+; CHECK-LABEL: define void @ld_div2_ld_scevunknown_nonuniform
+; CHECK-SAME: (ptr [[SRC_A:%.*]], ptr noalias [[SRC_B:%.*]], ptr noalias [[DST:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 5
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 6
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 7
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP2]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[SRC_A]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = load i64, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = load i64, ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = load i64, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = load i64, ptr [[TMP14]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = load i64, ptr [[TMP15]], align 4
+; CHECK-NEXT: [[TMP24:%.*]] = insertelement <8 x i64> poison, i64 [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 1
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 2
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> [[TMP26]], i64 [[TMP19]], i32 3
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x i64> [[TMP27]], i64 [[TMP20]], i32 4
+; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x i64> [[TMP28]], i64 [[TMP21]], i32 5
+; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x i64> [[TMP29]], i64 [[TMP22]], i32 6
+; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x i64> [[TMP30]], i64 [[TMP23]], i32 7
+; CHECK-NEXT: [[TMP32:%.*]] = udiv <8 x i64> [[TMP31]], splat (i64 2)
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i64> [[TMP32]], i32 0
+; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP33]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i64> [[TMP32]], i32 1
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP35]]
+; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i64> [[TMP32]], i32 2
+; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i64> [[TMP32]], i32 3
+; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP39]]
+; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i64> [[TMP32]], i32 4
+; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP41]]
+; CHECK-NEXT: [[TMP43:%.*]] = extractelement <8 x i64> [[TMP32]], i32 5
+; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP43]]
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <8 x i64> [[TMP32]], i32 6
+; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP45]]
+; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i64> [[TMP32]], i32 7
+; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i32, ptr [[SRC_B]], i64 [[TMP47]]
+; CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP34]], align 4
+; CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr [[TMP36]], align 4
+; CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP38]], align 4
+; CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr [[TMP40]], align 4
+; CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP42]], align 4
+; CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr [[TMP44]], align 4
+; CHECK-NEXT: [[TMP55:%.*]] = load i32, ptr [[TMP46]], align 4
+; CHECK-NEXT: [[TMP56:%.*]] = load i32, ptr [[TMP48]], align 4
+; CHECK-NEXT: [[TMP57:%.*]] = insertelement <8 x i32> poison, i32 [[TMP49]], i32 0
+; CHECK-NEXT: [[TMP58:%.*]] = insertelement <8 x i32> [[TMP57]], i32 [[TMP50]], i32 1
+; CHECK-NEXT: [[TMP59:%.*]] = insertelement <8 x i32> [[TMP58]], i32 [[TMP51]], i32 2
+; CHECK-NEXT: [[TMP60:%.*]] = insertelement <8 x i32> [[TMP59]], i32 [[TMP52]], i32 3
+; CHECK-NEXT: [[TMP61:%.*]] = insertelement <8 x i32> [[TMP60]], i32 [[TMP53]], i32 4
+; CHECK-NEXT: [[TMP62:%.*]] = insertelement <8 x i32> [[TMP61]], i32 [[TMP54]], i32 5
+; CHECK-NEXT: [[TMP63:%.*]] = insertelement <8 x i32> [[TMP62]], i32 [[TMP55]], i32 6
+; CHECK-NEXT: [[TMP64:%.*]] = insertelement <8 x i32> [[TMP63]], i32 [[TMP56]], i32 7
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr i32, ptr [[DST]], i64 [[TMP0]]
+; CHECK-NEXT: store <8 x i32> [[TMP64]], ptr [[TMP65]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK: scalar.ph:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %gep.a = getelementptr i32, ptr %src.a, i64 %iv
+ %load.a = load i64, ptr %gep.a
+ %d = udiv i64 %load.a, 2
+ %gep.b = getelementptr i32, ptr %src.b, i64 %d
+ %load.b = load i32, ptr %gep.b
+ %gep.dst = getelementptr i32, ptr %dst, i64 %iv
+ store i32 %load.b, ptr %gep.dst
+ %iv.next = add i64 %iv, 1
+ %exit.cond = icmp eq i64 %iv, 1000
+ br i1 %exit.cond, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
new file mode 100644
index 0000000..d281905
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
@@ -0,0 +1,539 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:128:128' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR128
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:64:64' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes=lower-matrix-intrinsics,instcombine -data-layout='p:32:32' -fuse-matrix-use-loops=false -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -verify-dom-info %s -S | FileCheck %s --check-prefix=PTR32
+
+; REQUIRES: aarch64-registered-target
+
+; See the comment in `data-layout.ll` for an explanation.
+
+target triple = "aarch64-unknown-unknown"
+
+define void @multiply(ptr %A, ptr %B, ptr %C) {
+; PTR128-LABEL: @multiply(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i128
+; PTR128-NEXT: [[STORE_END:%.*]] = add nuw nsw i128 [[STORE_BEGIN]], 128
+; PTR128-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i128
+; PTR128-NEXT: [[TMP0:%.*]] = icmp ugt i128 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR128-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR128: alias_cont:
+; PTR128-NEXT: [[LOAD_END:%.*]] = add nuw nsw i128 [[LOAD_BEGIN]], 128
+; PTR128-NEXT: [[TMP1:%.*]] = icmp ugt i128 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR128-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR128: copy:
+; PTR128-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR128-NEXT: br label [[NO_ALIAS]]
+; PTR128: no_alias:
+; PTR128-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR128-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i128
+; PTR128-NEXT: [[STORE_END5:%.*]] = add nuw nsw i128 [[STORE_BEGIN4]], 128
+; PTR128-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i128
+; PTR128-NEXT: [[TMP4:%.*]] = icmp ugt i128 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR128-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR128: alias_cont1:
+; PTR128-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i128 [[LOAD_BEGIN6]], 128
+; PTR128-NEXT: [[TMP5:%.*]] = icmp ugt i128 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR128-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR128: copy2:
+; PTR128-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR128-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR128-NEXT: br label [[NO_ALIAS3]]
+; PTR128: no_alias3:
+; PTR128-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
+; PTR128-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR128-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
+; PTR128-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR128-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR128-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR128-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR128-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
+; PTR128-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR128-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
+; PTR128-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR128-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
+; PTR128-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR128-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
+; PTR128-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR128-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR128-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR128-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR128-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR128-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i128 32
+; PTR128-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR128-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
+; PTR128-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR128-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
+; PTR128-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR128-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR128-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i128 32
+; PTR128-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR128-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR128-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR128-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR128-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
+; PTR128-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR128-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
+; PTR128-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR128-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i128 16
+; PTR128-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR128-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i128 48
+; PTR128-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR128-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR128-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR128-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR128-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i128 16
+; PTR128-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR128-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i128 48
+; PTR128-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR128-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR128-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i128 32
+; PTR128-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR128-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
+; PTR128-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR128-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
+; PTR128-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR128-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR128-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR128-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR128-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i128 64
+; PTR128-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR128-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i128 96
+; PTR128-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR128-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
+; PTR128-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR128-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
+; PTR128-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR128-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR128-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR128-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR128-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i128 64
+; PTR128-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR128-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i128 96
+; PTR128-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR128-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i128 16
+; PTR128-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR128-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i128 48
+; PTR128-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR128-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i128 64
+; PTR128-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR128-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i128 96
+; PTR128-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR128-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR128-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR128-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR128-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i128 80
+; PTR128-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR128-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i128 112
+; PTR128-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR128-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i128 80
+; PTR128-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR128-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i128 112
+; PTR128-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR128-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR128-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR128-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR128-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR128-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR128-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR128-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i128 80
+; PTR128-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR128-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i128 112
+; PTR128-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR128-NEXT: ret void
+;
+; PTR64-LABEL: @multiply(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64
+; PTR64-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 128
+; PTR64-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i64
+; PTR64-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR64-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR64: alias_cont:
+; PTR64-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 128
+; PTR64-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR64-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR64: copy:
+; PTR64-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR64-NEXT: br label [[NO_ALIAS]]
+; PTR64: no_alias:
+; PTR64-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR64-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i64
+; PTR64-NEXT: [[STORE_END5:%.*]] = add nuw nsw i64 [[STORE_BEGIN4]], 128
+; PTR64-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i64
+; PTR64-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR64-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR64: alias_cont1:
+; PTR64-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i64 [[LOAD_BEGIN6]], 128
+; PTR64-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR64-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR64: copy2:
+; PTR64-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR64-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR64-NEXT: br label [[NO_ALIAS3]]
+; PTR64: no_alias3:
+; PTR64-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
+; PTR64-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR64-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; PTR64-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR64-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR64-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR64-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR64-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
+; PTR64-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR64-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
+; PTR64-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
+; PTR64-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR64-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
+; PTR64-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR64-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR64-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR64-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR64-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR64-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i64 32
+; PTR64-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR64-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
+; PTR64-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR64-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
+; PTR64-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR64-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR64-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32
+; PTR64-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR64-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR64-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR64-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR64-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
+; PTR64-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR64-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
+; PTR64-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR64-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
+; PTR64-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR64-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48
+; PTR64-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR64-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR64-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR64-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR64-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i64 16
+; PTR64-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR64-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i64 48
+; PTR64-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR64-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR64-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32
+; PTR64-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR64-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; PTR64-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR64-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; PTR64-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR64-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR64-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR64-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR64-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64
+; PTR64-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR64-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96
+; PTR64-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR64-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
+; PTR64-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR64-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
+; PTR64-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR64-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR64-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR64-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR64-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i64 64
+; PTR64-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR64-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i64 96
+; PTR64-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR64-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16
+; PTR64-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR64-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48
+; PTR64-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR64-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64
+; PTR64-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR64-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96
+; PTR64-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR64-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR64-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR64-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR64-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80
+; PTR64-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR64-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112
+; PTR64-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR64-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80
+; PTR64-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR64-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112
+; PTR64-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR64-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR64-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR64-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR64-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR64-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR64-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR64-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i64 80
+; PTR64-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR64-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i64 112
+; PTR64-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR64-NEXT: ret void
+;
+; PTR32-LABEL: @multiply(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i32
+; PTR32-NEXT: [[STORE_END:%.*]] = add nuw nsw i32 [[STORE_BEGIN]], 128
+; PTR32-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[A:%.*]] to i32
+; PTR32-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[STORE_END]], [[LOAD_BEGIN]]
+; PTR32-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; PTR32: alias_cont:
+; PTR32-NEXT: [[LOAD_END:%.*]] = add nuw nsw i32 [[LOAD_BEGIN]], 128
+; PTR32-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[LOAD_END]], [[STORE_BEGIN]]
+; PTR32-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; PTR32: copy:
+; PTR32-NEXT: [[TMP2:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP2]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR32-NEXT: br label [[NO_ALIAS]]
+; PTR32: no_alias:
+; PTR32-NEXT: [[TMP3:%.*]] = phi ptr [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; PTR32-NEXT: [[STORE_BEGIN4:%.*]] = ptrtoint ptr [[C]] to i32
+; PTR32-NEXT: [[STORE_END5:%.*]] = add nuw nsw i32 [[STORE_BEGIN4]], 128
+; PTR32-NEXT: [[LOAD_BEGIN6:%.*]] = ptrtoint ptr [[A]] to i32
+; PTR32-NEXT: [[TMP4:%.*]] = icmp ugt i32 [[STORE_END5]], [[LOAD_BEGIN6]]
+; PTR32-NEXT: br i1 [[TMP4]], label [[ALIAS_CONT1:%.*]], label [[NO_ALIAS3:%.*]]
+; PTR32: alias_cont1:
+; PTR32-NEXT: [[LOAD_END7:%.*]] = add nuw nsw i32 [[LOAD_BEGIN6]], 128
+; PTR32-NEXT: [[TMP5:%.*]] = icmp ugt i32 [[LOAD_END7]], [[STORE_BEGIN4]]
+; PTR32-NEXT: br i1 [[TMP5]], label [[COPY2:%.*]], label [[NO_ALIAS3]]
+; PTR32: copy2:
+; PTR32-NEXT: [[TMP6:%.*]] = alloca [16 x double], align 8
+; PTR32-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 8 dereferenceable(128) [[TMP6]], ptr noundef nonnull align 8 dereferenceable(128) [[A]], i64 128, i1 false)
+; PTR32-NEXT: br label [[NO_ALIAS3]]
+; PTR32: no_alias3:
+; PTR32-NEXT: [[TMP7:%.*]] = phi ptr [ [[A]], [[NO_ALIAS]] ], [ [[A]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32
+; PTR32-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR32-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
+; PTR32-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
+; PTR32-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT14]], <2 x double> [[TMP8]])
+; PTR32-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]]
+; PTR32-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]])
+; PTR32-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64
+; PTR32-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8
+; PTR32-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96
+; PTR32-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8
+; PTR32-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; PTR32-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8
+; PTR32-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
+; PTR32-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]])
+; PTR32-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP15:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP14]])
+; PTR32-NEXT: [[SPLAT_SPLAT37:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP16:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT37]], <2 x double> [[TMP11]])
+; PTR32-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]])
+; PTR32-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8
+; PTR32-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i32 32
+; PTR32-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8
+; PTR32-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; PTR32-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8
+; PTR32-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48
+; PTR32-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8
+; PTR32-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8
+; PTR32-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i32 32
+; PTR32-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]]
+; PTR32-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT53]], <2 x double> [[TMP19]])
+; PTR32-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]]
+; PTR32-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]])
+; PTR32-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80
+; PTR32-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8
+; PTR32-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112
+; PTR32-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8
+; PTR32-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i32 16
+; PTR32-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8
+; PTR32-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i32 48
+; PTR32-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]])
+; PTR32-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP25]])
+; PTR32-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]])
+; PTR32-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]])
+; PTR32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i32 16
+; PTR32-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8
+; PTR32-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i32 48
+; PTR32-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8
+; PTR32-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8
+; PTR32-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i32 32
+; PTR32-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8
+; PTR32-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64
+; PTR32-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8
+; PTR32-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96
+; PTR32-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]]
+; PTR32-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP31]])
+; PTR32-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]]
+; PTR32-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]])
+; PTR32-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i32 64
+; PTR32-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8
+; PTR32-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i32 96
+; PTR32-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8
+; PTR32-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80
+; PTR32-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8
+; PTR32-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112
+; PTR32-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]])
+; PTR32-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP38:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP37]])
+; PTR32-NEXT: [[SPLAT_SPLAT115:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]])
+; PTR32-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]])
+; PTR32-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i32 64
+; PTR32-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8
+; PTR32-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i32 96
+; PTR32-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8
+; PTR32-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16
+; PTR32-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8
+; PTR32-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i32 48
+; PTR32-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8
+; PTR32-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i32 64
+; PTR32-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8
+; PTR32-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP7]], i32 96
+; PTR32-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]]
+; PTR32-NEXT: [[SPLAT_SPLAT131:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP45:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT131]], <2 x double> [[TMP44]])
+; PTR32-NEXT: [[SPLAT_SPLAT134:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]]
+; PTR32-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]])
+; PTR32-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i32 80
+; PTR32-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8
+; PTR32-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i32 112
+; PTR32-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8
+; PTR32-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i32 80
+; PTR32-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8
+; PTR32-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i32 112
+; PTR32-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8
+; PTR32-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]])
+; PTR32-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP51:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP50]])
+; PTR32-NEXT: [[SPLAT_SPLAT154:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> zeroinitializer
+; PTR32-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]])
+; PTR32-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
+; PTR32-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]])
+; PTR32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i32 80
+; PTR32-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8
+; PTR32-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i32 112
+; PTR32-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8
+; PTR32-NEXT: ret void
+;
+entry:
+ %a = load <16 x double>, ptr %A, align 8
+ %c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %a, i32 4, i32 4, i32 4)
+ store <16 x double> %c, ptr %C, align 8
+ ret void
+}
+
+declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
new file mode 100644
index 0000000..87def6b
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/data-layout.ll
@@ -0,0 +1,312 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:128:128' -S < %s | FileCheck %s --check-prefix=PTR128
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:64:64' -S < %s | FileCheck %s --check-prefix=PTR64
+; RUN: opt -passes='lower-matrix-intrinsics' -data-layout='p:32:32' -S < %s | FileCheck %s --check-prefix=PTR32
+
+; To properly support the matrix intrinsics on, e.g., 32-bit platforms (without
+; the need to emit `libc` calls), we perform strided index calculations using
+; the same bit-width as the matrix pointers, as determined by the data
+; layout. To verify this behaviour, this test runs several strided loads and
+; stores through the lowering pass with (32|64|128)-bit pointers, and verifies
+; the generated code extends / truncates strides accordingly. Similarly,
+; `data-layout-multiply-fused.ll` adopts this approach to verify the same
+; behaviour for index calculations emitted while lowering fused matrix
+; multiplies.
+
+define <9 x double> @strided_load_3x3_i128(ptr %in, i128 %stride) { ; 3x3 column-major load with a runtime i128 stride: checks expect the stride used as-is with 128-bit pointers and truncated to i64 / i32 with 64- / 32-bit pointers.
+; PTR128-LABEL: @strided_load_3x3_i128(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE:%.*]]
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE]]
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE]]
+; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i128(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i64
+; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i128(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[STRIDE_CAST:%.*]] = trunc i128 [[STRIDE:%.*]] to i32
+; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 %stride, i1 false, i32 3, i32 3) ; stride is wider than 64- and 32-bit pointers, so those runs must narrow it; i1 false is presumably the volatile flag (confirm against LangRef).
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i128(ptr %in) { ; 3x3 column-major load with a constant i128 stride of 16: checks expect no stride cast, only constant GEP indices 16 and 32 typed i128 / i64 / i32 to match the pointer width.
+; PTR128-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i128(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr %in, i128 16, i1 false, i32 3, i32 3) ; constant stride 16: the expected output has the per-column offsets (16, 32) already folded into the GEPs.
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_i64(ptr %in, i64 %stride) { ; 3x3 column-major load with a runtime i64 stride: checks expect zext to i128 with 128-bit pointers, direct use with 64-bit pointers, and trunc to i32 with 32-bit pointers.
+; PTR128-LABEL: @strided_load_3x3_i64(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i64 [[STRIDE:%.*]] to i128
+; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i64(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE:%.*]]
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE]]
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE]]
+; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i64(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[STRIDE_CAST:%.*]] = trunc i64 [[STRIDE:%.*]] to i32
+; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE_CAST]]
+; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 %stride, i1 false, i32 3, i32 3) ; i64 stride exercises both directions: widened (zext) for wider pointers, narrowed (trunc) for narrower ones.
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i64(ptr %in) {
+; PTR128-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i64(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr %in, i64 16, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_i32(ptr %in, i32 %stride) {
+; PTR128-LABEL: @strided_load_3x3_i32(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i128
+; PTR128-NEXT: [[VEC_START:%.*]] = mul i128 0, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i128 [[VEC_START]]
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_START1:%.*]] = mul i128 1, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START1]]
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[VEC_START4:%.*]] = mul i128 2, [[STRIDE_CAST]]
+; PTR128-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i128 [[VEC_START4]]
+; PTR128-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_i32(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; PTR64-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[VEC_START4:%.*]] = mul i64 2, [[STRIDE_CAST]]
+; PTR64-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START4]]
+; PTR64-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_i32(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[VEC_START4:%.*]] = mul i32 2, [[STRIDE]]
+; PTR32-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START4]]
+; PTR32-NEXT: [[COL_LOAD6:%.*]] = load <3 x double>, ptr [[VEC_GEP5]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD3]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD6]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 %stride, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+define <9 x double> @strided_load_3x3_const_stride_i32(ptr %in) {
+; PTR128-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR128-NEXT: entry:
+; PTR128-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR128-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i128 16
+; PTR128-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR128-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i128 32
+; PTR128-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR128-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR128-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR128-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR128-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR64-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR64-NEXT: entry:
+; PTR64-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR64-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 16
+; PTR64-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR64-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 32
+; PTR64-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR64-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR64-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR64-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR64-NEXT: ret <9 x double> [[TMP2]]
+;
+; PTR32-LABEL: @strided_load_3x3_const_stride_i32(
+; PTR32-NEXT: entry:
+; PTR32-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, ptr [[IN:%.*]], align 8
+; PTR32-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i32 16
+; PTR32-NEXT: [[COL_LOAD1:%.*]] = load <3 x double>, ptr [[VEC_GEP]], align 8
+; PTR32-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 32
+; PTR32-NEXT: [[COL_LOAD3:%.*]] = load <3 x double>, ptr [[VEC_GEP2]], align 8
+; PTR32-NEXT: [[TMP0:%.*]] = shufflevector <3 x double> [[COL_LOAD]], <3 x double> [[COL_LOAD1]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; PTR32-NEXT: [[TMP1:%.*]] = shufflevector <3 x double> [[COL_LOAD3]], <3 x double> poison, <6 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison>
+; PTR32-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP0]], <6 x double> [[TMP1]], <9 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+; PTR32-NEXT: ret <9 x double> [[TMP2]]
+;
+entry:
+ %load = call <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr %in, i32 16, i1 false, i32 3, i32 3)
+ ret <9 x double> %load
+}
+
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i128(ptr, i128, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i64(ptr, i64, i1, i32, i32)
+declare <9 x double> @llvm.matrix.column.major.load.v9f64.i32(ptr, i32, i1, i32, i32)
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
index ae7da19..abc4705 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-load-double.ll
@@ -62,11 +62,12 @@ declare <8 x double> @llvm.matrix.column.major.load.v8f64.i32(ptr, i32, i1, i32,
define <8 x double> @strided_load_4x2_stride_i32(ptr %in, i32 %stride) {
; CHECK-LABEL: @strided_load_4x2_stride_i32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN:%.*]], i64 [[VEC_START]]
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <4 x double>, ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[VEC_START1:%.*]] = mul i32 1, [[STRIDE]]
-; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i32 [[VEC_START1]]
+; CHECK-NEXT: [[VEC_START1:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[IN]], i64 [[VEC_START1]]
; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <4 x double>, ptr [[VEC_GEP2]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x double> [[COL_LOAD]], <4 x double> [[COL_LOAD4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: ret <8 x double> [[TMP0]]
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
index 28e9cdb..81b8507 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/strided-store-double.ll
@@ -34,11 +34,12 @@ define void @strided_store_3x2_nonconst_i32_stride(<6 x double> %in, i32 %stride
; CHECK-LABEL: @strided_store_3x2_nonconst_i32_stride(
; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <6 x double> [[IN:%.*]], <6 x double> poison, <3 x i32> <i32 0, i32 1, i32 2>
; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <6 x double> [[IN]], <6 x double> poison, <3 x i32> <i32 3, i32 4, i32 5>
-; CHECK-NEXT: [[VEC_START:%.*]] = mul i32 0, [[STRIDE:%.*]]
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i32 [[VEC_START]]
+; CHECK-NEXT: [[STRIDE_CAST:%.*]] = zext i32 [[STRIDE:%.*]] to i64
+; CHECK-NEXT: [[VEC_START:%.*]] = mul i64 0, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT:%.*]], i64 [[VEC_START]]
; CHECK-NEXT: store <3 x double> [[SPLIT]], ptr [[VEC_GEP]], align 8
-; CHECK-NEXT: [[VEC_START2:%.*]] = mul i32 1, [[STRIDE]]
-; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i32 [[VEC_START2]]
+; CHECK-NEXT: [[VEC_START2:%.*]] = mul i64 1, [[STRIDE_CAST]]
+; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[OUT]], i64 [[VEC_START2]]
; CHECK-NEXT: store <3 x double> [[SPLIT1]], ptr [[VEC_GEP3]], align 8
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/PGOProfile/data-access-profile.ll b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
index 29198f34..205184b 100644
--- a/llvm/test/Transforms/PGOProfile/data-access-profile.ll
+++ b/llvm/test/Transforms/PGOProfile/data-access-profile.ll
@@ -3,55 +3,72 @@
; RUN: rm -rf %t && split-file %s %t && cd %t
-;; Read a text profile and merge it into indexed profile.
+;; Read text profiles and merge them into indexed profiles.
; RUN: llvm-profdata merge --memprof-version=4 memprof.yaml -o memprof.profdata
+; RUN: llvm-profdata merge --memprof-version=4 memprof-no-dap.yaml -o memprof-no-dap.profdata
;; Run optimizer pass on an IR module without IR functions, and test that global
;; variables in the module could be annotated (i.e., no early return),
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S funcless-module.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
;; Run optimizer pass on the IR, and check the section prefix.
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' -memprof-annotate-static-data-prefix \
-; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,PREFIX,STAT
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefixes=LOG,IR,STAT
-;; Run optimizer pass without explicitly setting -memprof-annotate-static-data-prefix.
-;; The output text IR shouldn't have `section_prefix`
+;; Run memprof with a profile that has no data access profiles. Test that IR has
+;; module flag `EnableDataAccessProf` as 0.
+; RUN: opt -passes='memprof-use<profile-filename=memprof-no-dap.profdata>' -memprof-annotate-static-data-prefix \
+; RUN: -debug-only=memprof -stats -S input.ll -o - 2>&1 | FileCheck %s --check-prefix=FLAG
+
+;; Run memprof without explicitly setting -memprof-annotate-static-data-prefix.
+;; The output text IR shouldn't have `section_prefix` or EnableDataAccessProf module flag.
; RUN: opt -passes='memprof-use<profile-filename=memprof.profdata>' \
-; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --implicit-check-not="section_prefix"
+; RUN: -debug-only=memprof -stats -S input.ll -o - | FileCheck %s --check-prefix=FLAGLESS --implicit-check-not="section_prefix"
; LOG: Skip annotating string literal .str
; LOG: Global variable var1 is annotated as hot
; LOG: Global variable var2.llvm.125 is annotated as hot
; LOG: Global variable bar is not annotated
; LOG: Global variable foo is annotated as unlikely
-; LOG: Global variable var3 has explicit section name. Skip annotating.
-; LOG: Global variable var4 has explicit section name. Skip annotating.
+; LOG: Skip annotation for var3 due to explicit section name.
+; LOG: Skip annotation for var4 due to explicit section name.
+; LOG: Skip annotation for llvm.fake_var due to name starts with `llvm.`.
+; LOG: Skip annotation for qux due to linker declaration.
;; String literals are not annotated.
-; PREFIX: @.str = unnamed_addr constant [5 x i8] c"abcde"
-; PREFIX-NOT: section_prefix
-; PREFIX: @var1 = global i32 123, !section_prefix !0
+; IR: @.str = unnamed_addr constant [5 x i8] c"abcde"
+; IR-NOT: section_prefix
+; IR: @var1 = global i32 123, !section_prefix !0
;; @var.llvm.125 will be canonicalized to @var2 for profile look-up.
-; PREFIX-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
+; IR-NEXT: @var2.llvm.125 = global i64 0, !section_prefix !0
;; @bar is not seen in hot symbol or known symbol set, so it won't get a section
;; prefix. Test this by testing that there is no section_prefix between @bar and
;; @foo.
-; PREFIX-NEXT: @bar = global i16 3
-; PREFIX-NOT: !section_prefix
+; IR-NEXT: @bar = global i16 3
+; IR-NOT: !section_prefix
;; @foo is unlikely.
-; PREFIX-NEXT: @foo = global i8 2, !section_prefix !1
+; IR-NEXT: @foo = global i8 2, !section_prefix !1
+
+; IR-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
+; IR-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+
+; IR: @llvm.fake_var = global i32 123
+; IR-NOT: !section_prefix
+; IR: @qux = external global i64
+; IR-NOT: !section_prefix
-; PREFIX-NEXT: @var3 = constant [2 x i32] [i32 12345, i32 6789], section "sec1"
-; PREFIX-NEXT: @var4 = constant [1 x i64] [i64 98765] #0
+; IR: attributes #0 = { "rodata-section"="sec2" }
-; PREFIX: attributes #0 = { "rodata-section"="sec2" }
+; IR: !0 = !{!"section_prefix", !"hot"}
+; IR-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; IR-NEXT: !2 = !{i32 2, !"EnableDataAccessProf", i32 1}
-; PREFIX: !0 = !{!"section_prefix", !"hot"}
-; PREFIX-NEXT: !1 = !{!"section_prefix", !"unlikely"}
+; FLAG: !{i32 2, !"EnableDataAccessProf", i32 0}
+; FLAGLESS-NOT: EnableDataAccessProf
; STAT: 1 memprof - Number of global vars annotated with 'unlikely' section prefix.
; STAT: 2 memprof - Number of global vars with user-specified section (not annotated).
@@ -72,6 +89,24 @@ DataAccessProfiles:
- foo
KnownColdStrHashes: [ 999, 1001 ]
...
+;--- memprof-no-dap.yaml
+---
+# A memprof file without data access profiles. The heap records are simplified
+# to pass profile parsing and don't need to match the IR.
+HeapProfileRecords:
+ - GUID: 0xdeadbeef12345678
+ AllocSites:
+ - Callstack:
+ - { Function: 0x1111111111111111, LineOffset: 11, Column: 10, IsInlineFrame: true }
+ MemInfoBlock:
+ AllocCount: 111
+ TotalSize: 222
+ TotalLifetime: 333
+ TotalLifetimeAccessDensity: 444
+ CallSites:
+ - Frames:
+ - { Function: 0x5555555555555555, LineOffset: 55, Column: 50, IsInlineFrame: true }
+...
;--- input.ll
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
@@ -84,11 +119,14 @@ target triple = "x86_64-unknown-linux-gnu"
@foo = global i8 2
@var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
@var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
define i32 @func() {
%a = load i32, ptr @var1
%b = load i32, ptr @var2.llvm.125
- %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b)
+ %c = load i32, ptr @llvm.fake_var
+ %ret = call i32 (...) @func_taking_arbitrary_param(i32 %a, i32 %b, i32 %c)
ret i32 %ret
}
@@ -108,5 +146,8 @@ target triple = "x86_64-unknown-linux-gnu"
@foo = global i8 2
@var3 = constant [2 x i32][i32 12345, i32 6789], section "sec1"
@var4 = constant [1 x i64][i64 98765] #0
+@llvm.fake_var = global i32 123
+@qux = external global i64
+
attributes #0 = { "rodata-section"="sec2" }
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
index c5f72f2..fded7a4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll
@@ -4,21 +4,9 @@
define i32 @crash_reordering_undefs() {
; CHECK-LABEL: @crash_reordering_undefs(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef
-; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]]
-; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537
-; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef
-; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537
-; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]]
-; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537
+; CHECK-NEXT: [[ADD0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> splat (i32 65537))
; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 undef, [[ADD0]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[ADD2]], [[ADD4]]
-; CHECK-NEXT: [[OP_RDX2:%.*]] = add i32 [[OP_RDX]], [[OP_RDX1]]
-; CHECK-NEXT: [[OP_RDX3:%.*]] = add i32 [[OP_RDX2]], [[ADD9]]
-; CHECK-NEXT: ret i32 [[OP_RDX3]]
+; CHECK-NEXT: ret i32 [[OP_RDX]]
;
entry:
%or0 = or i64 undef, undef
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
index 3ac0d01..13b050d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-with-cmp-user.ll
@@ -6,15 +6,15 @@ define i1 @test(i32 %g, i16 %d) {
; CHECK-SAME: i32 [[G:%.*]], i16 [[D:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = and i16 [[D]], 1
-; CHECK-NEXT: [[XOR_I_I:%.*]] = xor i32 [[G]], 1
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[G]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[XOR_I_I]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i32> [[TMP2]], <i32 0, i32 1>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP4:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i8>
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i8> [[TMP4]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[TMP5]], <i8 -9, i8 -9, i8 -1, i8 -1>
; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <4 x i8> [[TMP6]], splat (i8 -3)
; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i1> [[TMP7]] to <4 x i8>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[TMP8]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i32> [[TMP10]], [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP12]])
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
index f07424f..43302f2 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll
@@ -3,32 +3,7 @@
define i32 @test() {
; CHECK-LABEL: define i32 @test() {
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], <i32 0, i32 0, i32 0, i32 poison>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <24 x i32> <i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 3, i32 3, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 7, i32 7, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <64 x i32> [[TMP9]], <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <64 x i32> <i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 poison, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <64 x i32> [[TMP10]], <64 x i32> [[TMP12]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 64, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 poison, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <64 x i32> [[TMP13]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 48, i32 49, i32 50, i32 51, i32 67, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <24 x i32> [[TMP6]], <24 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <64 x i32> [[TMP16]], <64 x i32> [[TMP15]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 64, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 65, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 66, i32 67, i32 67, i32 67, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i32> [[TMP27]], <64 x i32> [[TMP28]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <64 x i32> zeroinitializer, [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i32> zeroinitializer, [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i1> [[TMP19]], <64 x i1> [[TMP20]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP22:%.*]] = zext <64 x i1> [[TMP21]] to <64 x i8>
-; CHECK-NEXT: [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> [[TMP22]])
+; CHECK-NEXT: [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
; CHECK-NEXT: [[TMP24:%.*]] = sext i8 [[TMP23]] to i32
; CHECK-NEXT: ret i32 [[TMP24]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
index 1fedde4..3e9bd78 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-subvectors-insert.ll
@@ -3,12 +3,8 @@
define void @test() {
; CHECK-LABEL: define void @test() {
-; CHECK-NEXT: [[XOR108_I_I_I:%.*]] = xor i64 0, 1
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0>, i64 [[XOR108_I_I_I]], i32 10
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <12 x i64> [[TMP1]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i64> poison, i64 [[XOR108_I_I_I]], i32 3
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <12 x i64> [[TMP2]], <12 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> [[TMP5]], <16 x i64> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i64> poison, i64 1, i32 3
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 1, i64 0, i64 undef, i64 undef, i64 undef, i64 undef>, <16 x i64> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i64> [[TMP6]], <16 x i64> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 3, i32 7, i32 8, i32 9, i32 3, i32 10, i32 11, i32 12, i32 3>
; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i64> [[TMP7]] to <16 x i1>
; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP8]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
index 034fe82..c5442b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-scheduled-inst-reused-as-last-inst.ll
@@ -6,11 +6,10 @@
define void @foo() {
; CHECK-LABEL: define void @foo() {
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
@@ -24,11 +23,10 @@ define void @foo() {
;
; FORCED-LABEL: define void @foo() {
; FORCED-NEXT: bb:
-; FORCED-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 0, i32 0
; FORCED-NEXT: br label [[BB1:%.*]]
; FORCED: bb1:
; FORCED-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, [[BB:%.*]] ], [ [[TMP6:%.*]], [[BB4:%.*]] ]
-; FORCED-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], [[TMP0]]
+; FORCED-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[TMP1]], zeroinitializer
; FORCED-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP1]], <2 x i32> <i32 0, i32 3>
; FORCED-NEXT: [[TMP6]] = or <2 x i32> [[TMP5]], zeroinitializer
; FORCED-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
index 2612a21..e8078ad 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll
@@ -5,23 +5,22 @@ define i32 @test(i1 %cond) {
; CHECK-LABEL: define i32 @test(
; CHECK-SAME: i1 [[COND:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[OR92:%.*]] = or i32 1, 0
; CHECK-NEXT: br label %[[BB:.*]]
; CHECK: [[BB]]:
-; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OR92:%.*]], %[[BB]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[P1:%.*]] = phi i32 [ [[OR92]], %[[BB]] ], [ 0, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], %[[BB]] ], [ zeroinitializer, %[[ENTRY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> <i32 poison, i32 poison, i32 0, i32 0>, <4 x i32> <i32 poison, i32 1, i32 6, i32 7>
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[P1]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> <i32 4, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]]
-; CHECK-NEXT: [[OR92]] = or i32 1, 0
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[OR92]], i32 0
-; CHECK-NEXT: [[TMP8]] = xor <2 x i32> [[TMP9]], [[TMP7]]
-; CHECK-NEXT: [[OP_RDX:%.*]] = xor i32 [[TMP6]], [[OR92]]
+; CHECK-NEXT: [[TMP8]] = xor <2 x i32> [[TMP9]], <i32 1, i32 0>
; CHECK-NEXT: br i1 [[COND]], label %[[EXIT:.*]], label %[[BB]]
; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[OP_RDX:%.*]] = extractelement <2 x i32> [[TMP8]], i32 0
; CHECK-NEXT: ret i32 [[OP_RDX]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
index 4a5dd2a..b9f8390 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reschedule-only-scheduled.ll
@@ -8,42 +8,21 @@ define i16 @test() {
; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 0
; CHECK-NEXT: [[CALL99_I:%.*]] = call i32 @llvm.bswap.i32(i32 0)
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[CALL99_I]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = shl i32 0, 0
-; CHECK-NEXT: [[UNSCLEAR186_I:%.*]] = and i32 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = shl i32 0, 0
; CHECK-NEXT: [[CALL7_I45:%.*]] = tail call i32 null(i32 0)
; CHECK-NEXT: [[TMP8:%.*]] = lshr i32 [[CALL7_I45]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = lshr i32 0, 0
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <2 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <24 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 0, i32 poison, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 0>, <24 x i32> [[TMP16]], <24 x i32> <i32 0, i32 1, i32 24, i32 25, i32 poison, i32 5, i32 poison, i32 7, i32 poison, i32 poison, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <24 x i32> [[TMP17]], <24 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 24, i32 5, i32 26, i32 7, i32 28, i32 29, i32 poison, i32 11, i32 poison, i32 poison, i32 poison, i32 15, i32 poison, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <24 x i32> [[TMP18]], i32 [[UNSCLEAR186_I]], i32 10
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <24 x i32> [[TMP19]], <24 x i32> [[TMP20]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <24 x i32> [[TMP21]], <24 x i32> [[TMP22]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 24, i32 15, i32 25, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 22, i32 23>
-; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <24 x i32> [[TMP23]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 24, i32 25, i32 26, i32 27, i32 22, i32 23>
-; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <24 x i32> [[TMP24]], <24 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 poison, i32 0, i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, <24 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 4, i32 30, i32 6, i32 32, i32 33, i32 34, i32 poison, i32 36, i32 37, i32 38, i32 poison, i32 40, i32 poison, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <24 x i32> [[TMP25]], i32 [[UNSCLEAR186_I]], i32 11
-; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <24 x i32> <i32 0, i32 poison, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <24 x i32> [[TMP26]], <24 x i32> [[TMP27]], <24 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 24, i32 16, i32 26, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <24 x i32> [[TMP24]], [[TMP28]]
-; CHECK-NEXT: [[RDX_OP:%.*]] = shufflevector <24 x i1> [[TMP29]], <24 x i1> <i1 false, i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast <28 x i1> [[RDX_OP]] to i28
-; CHECK-NEXT: [[TMP31:%.*]] = call i28 @llvm.ctpop.i28(i28 [[TMP30]])
-; CHECK-NEXT: [[TMP32:%.*]] = trunc i28 [[TMP31]] to i16
-; CHECK-NEXT: [[TMP33:%.*]] = call i4 @llvm.ctpop.i4(i4 -8)
-; CHECK-NEXT: [[TMP34:%.*]] = zext i4 [[TMP33]] to i16
-; CHECK-NEXT: [[OP_RDX4:%.*]] = add i16 [[TMP34]], [[TMP32]]
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <28 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 poison>, i32 [[TMP1]], i32 4
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <28 x i32> [[TMP4]], i32 [[TMP2]], i32 5
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <28 x i32> [[TMP5]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 28, i32 29, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <28 x i32> [[TMP6]], i32 [[TMP8]], i32 12
+; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <28 x i32> [[TMP7]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 12, i32 28, i32 29, i32 30, i32 31, i32 poison, i32 poison, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <28 x i32> [[TMP16]], <28 x i32> <i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 28, i32 29, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <28 x i32> [[TMP9]], <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, <28 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 28, i32 29, i32 30, i32 31, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27>
+; CHECK-NEXT: [[TMP11:%.*]] = and <28 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison>, [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <28 x i32> [[TMP11]], <28 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 9, i32 10, i32 10, i32 11, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
+; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1>, [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x i1> [[TMP13]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.ctpop.i32(i32 [[TMP14]])
+; CHECK-NEXT: [[OP_RDX4:%.*]] = trunc i32 [[TMP15]] to i16
; CHECK-NEXT: ret i16 [[OP_RDX4]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll b/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
index a7f8629..78708a2 100644
--- a/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
+++ b/llvm/test/Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
@@ -6,20 +6,12 @@ define void @test() {
; CHECK-LABEL: define void @test() {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr null, align 2
-; CHECK-NEXT: [[TMP1:%.*]] = and i8 0, 1
; CHECK-NEXT: [[TMP2:%.*]] = and i32 0, 0
; CHECK-NEXT: [[TMP3:%.*]] = select i1 false, i32 0, i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> <i8 0, i8 poison, i8 poison, i8 poison>, i8 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> zeroinitializer, zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP15]] to <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 poison, i32 0, i32 0>, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
-; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i1> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 0, [[TMP14]]
; CHECK-NEXT: store i32 [[OP_RDX]], ptr null, align 4
diff --git a/llvm/test/Verifier/llvm.used-invalid-init.ll b/llvm/test/Verifier/llvm.used-invalid-init.ll
index 15a961c..38c84b15 100644
--- a/llvm/test/Verifier/llvm.used-invalid-init.ll
+++ b/llvm/test/Verifier/llvm.used-invalid-init.ll
@@ -2,5 +2,5 @@
@llvm.used = appending global [1 x ptr] zeroinitializer, section "llvm.metadata"
-; CHECK: wrong initalizer for intrinsic global variable
+; CHECK: wrong initializer for intrinsic global variable
; CHECK-NEXT: [1 x ptr] zeroinitializer
diff --git a/llvm/tools/bugpoint/BugDriver.cpp b/llvm/tools/bugpoint/BugDriver.cpp
index 2bdfebe..a7e93f6 100644
--- a/llvm/tools/bugpoint/BugDriver.cpp
+++ b/llvm/tools/bugpoint/BugDriver.cpp
@@ -27,9 +27,7 @@
#include <memory>
using namespace llvm;
-namespace llvm {
-Triple TargetTriple;
-}
+Triple llvm::TargetTriple;
DiscardTemp::~DiscardTemp() {
if (SaveTemps) {
@@ -41,18 +39,14 @@ DiscardTemp::~DiscardTemp() {
errs() << "Failed to delete temp file " << toString(std::move(E)) << '\n';
}
-// Anonymous namespace to define command line options for debugging.
-//
-namespace {
// Output - The user can specify a file containing the expected output of the
// program. If this filename is set, it is used as the reference diff source,
// otherwise the raw input run through an interpreter is used as the reference
// source.
//
-cl::opt<std::string> OutputFile("output",
- cl::desc("Specify a reference program output "
- "(for miscompilation detection)"));
-}
+static cl::opt<std::string>
+ OutputFile("output", cl::desc("Specify a reference program output "
+ "(for miscompilation detection)"));
/// If we reduce or update the program somehow, call this method to update
/// bugdriver with it. This deletes the old module and sets the specified one
@@ -238,7 +232,7 @@ Error BugDriver::run() {
return Error::success();
}
-void llvm::PrintFunctionList(const std::vector<Function *> &Funcs) {
+void llvm::printFunctionList(const std::vector<Function *> &Funcs) {
unsigned NumPrint = Funcs.size();
if (NumPrint > 10)
NumPrint = 10;
@@ -249,7 +243,7 @@ void llvm::PrintFunctionList(const std::vector<Function *> &Funcs) {
outs().flush();
}
-void llvm::PrintGlobalVariableList(const std::vector<GlobalVariable *> &GVs) {
+void llvm::printGlobalVariableList(const std::vector<GlobalVariable *> &GVs) {
unsigned NumPrint = GVs.size();
if (NumPrint > 10)
NumPrint = 10;
diff --git a/llvm/tools/bugpoint/BugDriver.h b/llvm/tools/bugpoint/BugDriver.h
index e3117ec..ca57405 100644
--- a/llvm/tools/bugpoint/BugDriver.h
+++ b/llvm/tools/bugpoint/BugDriver.h
@@ -57,7 +57,6 @@ class BugDriver {
// FIXME: sort out public/private distinctions...
friend class ReducePassList;
- friend class ReduceMisCodegenFunctions;
public:
BugDriver(const char *toolname, bool find_bugs, unsigned timeout,
@@ -76,7 +75,7 @@ public:
void setPassesToRun(const std::vector<std::string> &PTR) {
PassesToRun = PTR;
}
- const std::vector<std::string> &getPassesToRun() const { return PassesToRun; }
+ ArrayRef<std::string> getPassesToRun() const { return PassesToRun; }
/// run - The top level method that is invoked after all of the instance
/// variables are set up from command line arguments. The \p as_child argument
@@ -111,7 +110,6 @@ public:
Error debugCodeGenerator();
/// isExecutingJIT - Returns true if bugpoint is currently testing the JIT
- ///
bool isExecutingJIT();
Module &getProgram() const { return *Program; }
@@ -167,7 +165,7 @@ public:
bool RemoveBitcode = false) const;
/// This function is used to output M to a file named "bugpoint-ID.bc".
- void EmitProgressBitcode(const Module &M, const std::string &ID,
+ void emitProgressBitcode(const Module &M, const std::string &ID,
bool NoFlyer = false) const;
/// This method clones the current Program and deletes the specified
@@ -214,7 +212,6 @@ public:
/// outs() a single line message indicating whether compilation was successful
/// or failed, unless Quiet is set. ExtraArgs specifies additional arguments
/// to pass to the child bugpoint instance.
- ///
bool runPasses(Module &Program, const std::vector<std::string> &PassesToRun,
std::string &OutputFilename, bool DeleteOutput = false,
bool Quiet = false,
@@ -223,7 +220,6 @@ public:
/// runPasses - Just like the method above, but this just returns true or
/// false indicating whether or not the optimizer crashed on the specified
/// input (true = crashed). Does not produce any output.
- ///
bool runPasses(Module &M, const std::vector<std::string> &PassesToRun) const {
std::string Filename;
return runPasses(M, PassesToRun, Filename, true);
@@ -247,7 +243,6 @@ public:
private:
/// initializeExecutionEnvironment - This method is used to set up the
/// environment for executing LLVM programs.
- ///
Error initializeExecutionEnvironment();
};
@@ -258,37 +253,31 @@ struct DiscardTemp {
/// Given a bitcode or assembly input filename, parse and return it, or return
/// null if not possible.
-///
std::unique_ptr<Module> parseInputFile(StringRef InputFilename,
LLVMContext &ctxt);
/// getPassesString - Turn a list of passes into a string which indicates the
/// command line options that must be passed to add the passes.
-///
std::string getPassesString(const std::vector<std::string> &Passes);
-/// PrintFunctionList - prints out list of problematic functions
-///
-void PrintFunctionList(const std::vector<Function *> &Funcs);
+/// Prints out list of problematic functions
+void printFunctionList(const std::vector<Function *> &Funcs);
-/// PrintGlobalVariableList - prints out list of problematic global variables
-///
-void PrintGlobalVariableList(const std::vector<GlobalVariable *> &GVs);
+/// Prints out list of problematic global variables
+void printGlobalVariableList(const std::vector<GlobalVariable *> &GVs);
-// DeleteGlobalInitializer - "Remove" the global variable by deleting its
-// initializer, making it external.
-//
-void DeleteGlobalInitializer(GlobalVariable *GV);
+/// "Remove" the global variable by deleting its initializer, making it
+/// external.
+void deleteGlobalInitializer(GlobalVariable *GV);
-// DeleteFunctionBody - "Remove" the function by deleting all of it's basic
-// blocks, making it external.
-//
-void DeleteFunctionBody(Function *F);
+/// "Remove" the function by deleting all of its basic blocks, making it
+/// external.
+void deleteFunctionBody(Function *F);
/// Given a module and a list of functions in the module, split the functions
/// OUT of the specified module, and place them in the new module.
std::unique_ptr<Module>
-SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
+splitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
ValueToValueMapTy &VMap);
} // End llvm namespace
diff --git a/llvm/tools/bugpoint/CrashDebugger.cpp b/llvm/tools/bugpoint/CrashDebugger.cpp
index fcac014..240300b 100644
--- a/llvm/tools/bugpoint/CrashDebugger.cpp
+++ b/llvm/tools/bugpoint/CrashDebugger.cpp
@@ -36,39 +36,44 @@
#include <set>
using namespace llvm;
-namespace {
-cl::opt<bool> KeepMain("keep-main",
- cl::desc("Force function reduction to keep main"),
- cl::init(false));
-cl::opt<bool> NoGlobalRM("disable-global-remove",
- cl::desc("Do not remove global variables"),
- cl::init(false));
-
-cl::opt<bool> NoAttributeRM("disable-attribute-remove",
- cl::desc("Do not remove function attributes"),
- cl::init(false));
-
-cl::opt<bool> ReplaceFuncsWithNull(
+static cl::opt<bool> KeepMain("keep-main",
+ cl::desc("Force function reduction to keep main"),
+ cl::init(false));
+static cl::opt<bool> NoGlobalRM("disable-global-remove",
+ cl::desc("Do not remove global variables"),
+ cl::init(false));
+
+static cl::opt<bool>
+ NoAttributeRM("disable-attribute-remove",
+ cl::desc("Do not remove function attributes"),
+ cl::init(false));
+
+static cl::opt<bool> ReplaceFuncsWithNull(
"replace-funcs-with-null",
cl::desc("When stubbing functions, replace all uses will null"),
cl::init(false));
-cl::opt<bool> DontReducePassList("disable-pass-list-reduction",
- cl::desc("Skip pass list reduction steps"),
- cl::init(false));
-
-cl::opt<bool> NoNamedMDRM("disable-namedmd-remove",
- cl::desc("Do not remove global named metadata"),
- cl::init(false));
-cl::opt<bool> NoStripDebugInfo("disable-strip-debuginfo",
- cl::desc("Do not strip debug info metadata"),
- cl::init(false));
-cl::opt<bool> NoStripDebugTypeInfo("disable-strip-debug-types",
- cl::desc("Do not strip debug type info metadata"),
- cl::init(false));
-cl::opt<bool> VerboseErrors("verbose-errors",
- cl::desc("Print the output of crashing program"),
- cl::init(false));
-}
+
+static cl::opt<bool>
+ DontReducePassList("disable-pass-list-reduction",
+ cl::desc("Skip pass list reduction steps"),
+ cl::init(false));
+
+static cl::opt<bool>
+ NoNamedMDRM("disable-namedmd-remove",
+ cl::desc("Do not remove global named metadata"),
+ cl::init(false));
+static cl::opt<bool>
+ NoStripDebugInfo("disable-strip-debuginfo",
+ cl::desc("Do not strip debug info metadata"),
+ cl::init(false));
+static cl::opt<bool>
+ NoStripDebugTypeInfo("disable-strip-debug-types",
+ cl::desc("Do not strip debug type info metadata"),
+ cl::init(false));
+static cl::opt<bool>
+ VerboseErrors("verbose-errors",
+ cl::desc("Print the output of crashing program"),
+ cl::init(false));
static bool isValidModule(std::unique_ptr<Module> &M,
bool ExitOnFailure = true) {
@@ -83,6 +88,8 @@ static bool isValidModule(std::unique_ptr<Module> &M,
}
namespace llvm {
+// Note this class needs to be in llvm namespace since it's declared as a friend
+// of BugDriver.
class ReducePassList : public ListReducer<std::string> {
BugDriver &BD;
@@ -95,7 +102,7 @@ public:
Expected<TestResult> doTest(std::vector<std::string> &Removed,
std::vector<std::string> &Kept) override;
};
-}
+} // namespace llvm
Expected<ReducePassList::TestResult>
ReducePassList::doTest(std::vector<std::string> &Prefix,
@@ -156,7 +163,7 @@ public:
bool TestGlobalVariables(std::vector<GlobalVariable *> &GVs);
};
-}
+} // namespace
bool ReduceCrashingGlobalInitializers::TestGlobalVariables(
std::vector<GlobalVariable *> &GVs) {
@@ -174,14 +181,14 @@ bool ReduceCrashingGlobalInitializers::TestGlobalVariables(
}
outs() << "Checking for crash with only these global variables: ";
- PrintGlobalVariableList(GVs);
+ printGlobalVariableList(GVs);
outs() << ": ";
// Loop over and delete any global variables which we aren't supposed to be
// playing with...
for (GlobalVariable &I : M->globals())
if (I.hasInitializer() && !GVSet.count(&I)) {
- DeleteGlobalInitializer(&I);
+ deleteGlobalInitializer(&I);
I.setLinkage(GlobalValue::ExternalLinkage);
I.setComdat(nullptr);
}
@@ -223,7 +230,7 @@ public:
bool TestFuncs(std::vector<Function *> &Prefix);
};
-}
+} // namespace
static void RemoveFunctionReferences(Module *M, const char *Name) {
auto *UsedVar = M->getGlobalVariable(Name, true);
@@ -269,14 +276,14 @@ bool ReduceCrashingFunctions::TestFuncs(std::vector<Function *> &Funcs) {
}
outs() << "Checking for crash with only these functions: ";
- PrintFunctionList(Funcs);
+ printFunctionList(Funcs);
outs() << ": ";
if (!ReplaceFuncsWithNull) {
// Loop over and delete any functions which we aren't supposed to be playing
// with...
for (Function &I : *M)
if (!I.isDeclaration() && !Functions.count(&I))
- DeleteFunctionBody(&I);
+ deleteFunctionBody(&I);
} else {
std::vector<GlobalValue *> ToRemove;
// First, remove aliases to functions we're about to purge.
@@ -356,7 +363,7 @@ public:
bool TestFuncAttrs(std::vector<Attribute> &Attrs);
};
-}
+} // namespace
bool ReduceCrashingFunctionAttributes::TestFuncAttrs(
std::vector<Attribute> &Attrs) {
@@ -396,12 +403,11 @@ bool ReduceCrashingFunctionAttributes::TestFuncAttrs(
return false;
}
-namespace {
/// Simplify the CFG without completely destroying it.
/// This is not well defined, but basically comes down to "try to eliminate
/// unreachable blocks and constant fold terminators without deciding that
/// certain undefined behavior cuts off the program at the legs".
-void simpleSimplifyCfg(Function &F, SmallVectorImpl<BasicBlock *> &BBs) {
+static void simpleSimplifyCfg(Function &F, SmallVectorImpl<BasicBlock *> &BBs) {
if (F.empty())
return;
@@ -435,6 +441,8 @@ void simpleSimplifyCfg(Function &F, SmallVectorImpl<BasicBlock *> &BBs) {
for (auto *BB : Unreachable)
BB->eraseFromParent();
}
+
+namespace {
/// ReduceCrashingBlocks reducer - This works by setting the terminators of
/// all terminators except the specified basic blocks to a 'ret' instruction,
/// then running the simplifycfg pass. This has the effect of chopping up
@@ -459,7 +467,7 @@ public:
bool TestBlocks(std::vector<const BasicBlock *> &Prefix);
};
-}
+} // namespace
bool ReduceCrashingBlocks::TestBlocks(std::vector<const BasicBlock *> &BBs) {
// Clone the program to try hacking it apart...
@@ -571,7 +579,7 @@ public:
bool TestBlocks(std::vector<const BasicBlock *> &Prefix);
};
-}
+} // namespace
bool ReduceCrashingConditionals::TestBlocks(
std::vector<const BasicBlock *> &BBs) {
@@ -670,7 +678,7 @@ public:
bool TestBlocks(std::vector<const BasicBlock *> &Prefix);
};
-}
+} // namespace
bool ReduceSimplifyCFG::TestBlocks(std::vector<const BasicBlock *> &BBs) {
// Clone the program to try hacking it apart...
@@ -755,7 +763,7 @@ public:
bool TestInsts(std::vector<const Instruction *> &Prefix);
};
-}
+} // namespace
bool ReduceCrashingInstructions::TestInsts(
std::vector<const Instruction *> &Insts) {
@@ -896,7 +904,7 @@ public:
bool TestNamedMDs(std::vector<std::string> &NamedMDs);
};
-}
+} // namespace
bool ReduceCrashingNamedMD::TestNamedMDs(std::vector<std::string> &NamedMDs) {
@@ -959,7 +967,7 @@ public:
bool TestNamedMDOps(std::vector<const MDNode *> &NamedMDOps);
};
-}
+} // namespace
bool ReduceCrashingNamedMDOps::TestNamedMDOps(
std::vector<const MDNode *> &NamedMDOps) {
@@ -1018,7 +1026,7 @@ static Error ReduceGlobalInitializers(BugDriver &BD, BugTester TestFn) {
for (GlobalVariable &GV : M->globals()) {
if (GV.hasInitializer()) {
- DeleteGlobalInitializer(&GV);
+ deleteGlobalInitializer(&GV);
GV.setLinkage(GlobalValue::ExternalLinkage);
GV.setComdat(nullptr);
DeletedInit = true;
@@ -1056,7 +1064,7 @@ static Error ReduceGlobalInitializers(BugDriver &BD, BugTester TestFn) {
return E;
if (GVs.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-global-variables");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-global-variables");
}
return Error::success();
}
@@ -1155,7 +1163,7 @@ static Error ReduceInsts(BugDriver &BD, BugTester TestFn) {
return E;
}
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-instructions");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-instructions");
return Error::success();
}
@@ -1186,7 +1194,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
return E;
if (Functions.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-function");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-function");
}
if (!NoAttributeRM) {
@@ -1218,7 +1226,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
}
if (OldSize < NewSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-function-attributes");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-function-attributes");
}
}
@@ -1238,7 +1246,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
if (Blocks.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-conditionals");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-conditionals");
}
// Attempt to delete entire basic blocks at a time to speed up
@@ -1256,7 +1264,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
if (Blocks.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-blocks");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-blocks");
}
if (!DisableSimplifyCFG && !BugpointIsInterrupted) {
@@ -1269,7 +1277,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
if (Blocks.size() < OldSize)
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-simplifycfg");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-simplifycfg");
}
// Attempt to delete instructions using bisection. This should help out nasty
@@ -1319,7 +1327,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
if (Error E = Result.takeError())
return E;
}
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-named-md");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-named-md");
}
// Try to clean up the testcase by running funcresolve and globaldce...
@@ -1334,7 +1342,7 @@ static Error DebugACrash(BugDriver &BD, BugTester TestFn) {
std::move(M)); // Yup, it does, keep the reduced version...
}
- BD.EmitProgressBitcode(BD.getProgram(), "reduced-simplified");
+ BD.emitProgressBitcode(BD.getProgram(), "reduced-simplified");
return Error::success();
}
@@ -1361,7 +1369,7 @@ Error BugDriver::debugOptimizerCrash(const std::string &ID) {
<< (PassesToRun.size() == 1 ? ": " : "es: ")
<< getPassesString(PassesToRun) << '\n';
- EmitProgressBitcode(*Program, ID);
+ emitProgressBitcode(*Program, ID);
auto Res = DebugACrash(*this, TestForOptimizerCrash);
if (Res || DontReducePassList)
@@ -1376,7 +1384,7 @@ Error BugDriver::debugOptimizerCrash(const std::string &ID) {
<< (PassesToRun.size() == 1 ? ": " : "es: ")
<< getPassesString(PassesToRun) << '\n';
- EmitProgressBitcode(getProgram(), "reduced-simplified");
+ emitProgressBitcode(getProgram(), "reduced-simplified");
return Res;
}
diff --git a/llvm/tools/bugpoint/ExecutionDriver.cpp b/llvm/tools/bugpoint/ExecutionDriver.cpp
index 165b55f..8c6b7fb 100644
--- a/llvm/tools/bugpoint/ExecutionDriver.cpp
+++ b/llvm/tools/bugpoint/ExecutionDriver.cpp
@@ -36,15 +36,16 @@ enum OutputType {
CompileCustom,
Custom
};
+} // namespace
-cl::opt<double> AbsTolerance("abs-tolerance",
- cl::desc("Absolute error tolerated"),
- cl::init(0.0));
-cl::opt<double> RelTolerance("rel-tolerance",
- cl::desc("Relative error tolerated"),
- cl::init(0.0));
+static cl::opt<double> AbsTolerance("abs-tolerance",
+ cl::desc("Absolute error tolerated"),
+ cl::init(0.0));
+static cl::opt<double> RelTolerance("rel-tolerance",
+ cl::desc("Relative error tolerated"),
+ cl::init(0.0));
-cl::opt<OutputType> InterpreterSel(
+static cl::opt<OutputType> InterpreterSel(
cl::desc("Specify the \"test\" i.e. suspect back-end:"),
cl::values(clEnumValN(AutoPick, "auto", "Use best guess"),
clEnumValN(RunLLI, "run-int", "Execute with the interpreter"),
@@ -60,7 +61,7 @@ cl::opt<OutputType> InterpreterSel(
"the bitcode. Useful for cross-compilation.")),
cl::init(AutoPick));
-cl::opt<OutputType> SafeInterpreterSel(
+static cl::opt<OutputType> SafeInterpreterSel(
cl::desc("Specify \"safe\" i.e. known-good backend:"),
cl::values(clEnumValN(AutoPick, "safe-auto", "Use best guess"),
clEnumValN(RunLLC, "safe-run-llc", "Compile with LLC"),
@@ -69,16 +70,16 @@ cl::opt<OutputType> SafeInterpreterSel(
"the bitcode. Useful for cross-compilation.")),
cl::init(AutoPick));
-cl::opt<std::string> SafeInterpreterPath(
+static cl::opt<std::string> SafeInterpreterPath(
"safe-path", cl::desc("Specify the path to the \"safe\" backend program"),
cl::init(""));
-cl::opt<bool> AppendProgramExitCode(
+static cl::opt<bool> AppendProgramExitCode(
"append-exit-code",
cl::desc("Append the exit code to the output so it gets diff'd too"),
cl::init(false));
-cl::opt<std::string>
+static cl::opt<std::string>
InputFile("input", cl::init("/dev/null"),
cl::desc("Filename to pipe in as stdin (default: /dev/null)"));
@@ -89,20 +90,19 @@ static cl::list<std::string>
static cl::list<std::string> AdditionalLinkerArgs(
"Xlinker", cl::desc("Additional arguments to pass to the linker"));
-cl::opt<std::string> CustomCompileCommand(
+static cl::opt<std::string> CustomCompileCommand(
"compile-command", cl::init("llc"),
cl::desc("Command to compile the bitcode (use with -compile-custom) "
"(default: llc)"));
-cl::opt<std::string> CustomExecCommand(
+static cl::opt<std::string> CustomExecCommand(
"exec-command", cl::init("simulate"),
cl::desc("Command to execute the bitcode (use with -run-custom) "
"(default: simulate)"));
-}
-namespace llvm {
// Anything specified after the --args option are taken as arguments to the
// program being debugged.
+namespace llvm {
cl::list<std::string> InputArgv("args", cl::Positional,
cl::desc("<program arguments>..."),
cl::PositionalEatsArgs);
@@ -110,25 +110,22 @@ cl::list<std::string> InputArgv("args", cl::Positional,
cl::opt<std::string>
OutputPrefix("output-prefix", cl::init("bugpoint"),
cl::desc("Prefix to use for outputs (default: 'bugpoint')"));
-}
-
-namespace {
-cl::list<std::string> ToolArgv("tool-args", cl::Positional,
- cl::desc("<tool arguments>..."),
- cl::PositionalEatsArgs);
+} // namespace llvm
-cl::list<std::string> SafeToolArgv("safe-tool-args", cl::Positional,
- cl::desc("<safe-tool arguments>..."),
- cl::PositionalEatsArgs);
+static cl::list<std::string> ToolArgv("tool-args", cl::Positional,
+ cl::desc("<tool arguments>..."),
+ cl::PositionalEatsArgs);
-cl::opt<std::string> CCBinary("gcc", cl::init(""),
- cl::desc("The gcc binary to use."));
+static cl::list<std::string> SafeToolArgv("safe-tool-args", cl::Positional,
+ cl::desc("<safe-tool arguments>..."),
+ cl::PositionalEatsArgs);
-cl::list<std::string> CCToolArgv("gcc-tool-args", cl::Positional,
- cl::desc("<gcc-tool arguments>..."),
- cl::PositionalEatsArgs);
-}
+static cl::opt<std::string> CCBinary("gcc", cl::init(""),
+ cl::desc("The gcc binary to use."));
+static cl::list<std::string> CCToolArgv("gcc-tool-args", cl::Positional,
+ cl::desc("<gcc-tool arguments>..."),
+ cl::PositionalEatsArgs);
//===----------------------------------------------------------------------===//
// BugDriver method implementation
//
diff --git a/llvm/tools/bugpoint/ExtractFunction.cpp b/llvm/tools/bugpoint/ExtractFunction.cpp
index dd9a82c..3206589 100644
--- a/llvm/tools/bugpoint/ExtractFunction.cpp
+++ b/llvm/tools/bugpoint/ExtractFunction.cpp
@@ -35,19 +35,19 @@ using namespace llvm;
#define DEBUG_TYPE "bugpoint"
+bool llvm::DisableSimplifyCFG = false;
namespace llvm {
-bool DisableSimplifyCFG = false;
extern cl::opt<std::string> OutputPrefix;
-} // End llvm namespace
+} // namespace llvm
-namespace {
-cl::opt<bool> NoDCE("disable-dce",
- cl::desc("Do not use the -dce pass to reduce testcases"));
-cl::opt<bool, true>
+static cl::opt<bool>
+ NoDCE("disable-dce",
+ cl::desc("Do not use the -dce pass to reduce testcases"));
+static cl::opt<bool, true>
NoSCFG("disable-simplifycfg", cl::location(DisableSimplifyCFG),
cl::desc("Do not use the -simplifycfg pass to reduce testcases"));
-Function *globalInitUsesExternalBA(GlobalVariable *GV) {
+static Function *globalInitUsesExternalBA(GlobalVariable *GV) {
if (!GV->hasInitializer())
return nullptr;
@@ -78,7 +78,6 @@ Function *globalInitUsesExternalBA(GlobalVariable *GV) {
}
return nullptr;
}
-} // end anonymous namespace
std::unique_ptr<Module>
BugDriver::deleteInstructionFromProgram(const Instruction *I,
@@ -154,7 +153,7 @@ std::unique_ptr<Module> BugDriver::extractLoop(Module *M) {
std::unique_ptr<Module> NewM = runPassesOn(M, LoopExtractPasses);
if (!NewM) {
outs() << "*** Loop extraction failed: ";
- EmitProgressBitcode(*M, "loopextraction", true);
+ emitProgressBitcode(*M, "loopextraction", true);
outs() << "*** Sorry. :( Please report a bug!\n";
return nullptr;
}
@@ -198,21 +197,16 @@ static void eliminateAliases(GlobalValue *GV) {
}
}
-//
-// DeleteGlobalInitializer - "Remove" the global variable by deleting its
-// initializer,
-// making it external.
-//
-void llvm::DeleteGlobalInitializer(GlobalVariable *GV) {
+// "Remove" the global variable by deleting its initializer, making it external.
+void llvm::deleteGlobalInitializer(GlobalVariable *GV) {
eliminateAliases(GV);
GV->setInitializer(nullptr);
GV->setComdat(nullptr);
}
-// DeleteFunctionBody - "Remove" the function by deleting all of its basic
-// blocks, making it external.
-//
-void llvm::DeleteFunctionBody(Function *F) {
+// "Remove" the function by deleting all of its basic blocks, making it
+// external.
+void llvm::deleteFunctionBody(Function *F) {
eliminateAliases(F);
// Function declarations can't have comdats.
F->setComdat(nullptr);
@@ -222,9 +216,9 @@ void llvm::DeleteFunctionBody(Function *F) {
assert(F->isDeclaration() && "This didn't make the function external!");
}
-/// GetTorInit - Given a list of entries for static ctors/dtors, return them
+/// getTorInit - Given a list of entries for static ctors/dtors, return them
/// as a constant array.
-static Constant *GetTorInit(std::vector<std::pair<Function *, int>> &TorList) {
+static Constant *getTorInit(std::vector<std::pair<Function *, int>> &TorList) {
assert(!TorList.empty() && "Don't create empty tor list!");
std::vector<Constant *> ArrayElts;
Type *Int32Ty = Type::getInt32Ty(TorList[0].first->getContext());
@@ -239,11 +233,11 @@ static Constant *GetTorInit(std::vector<std::pair<Function *, int>> &TorList) {
ArrayType::get(ArrayElts[0]->getType(), ArrayElts.size()), ArrayElts);
}
-/// SplitStaticCtorDtor - A module was recently split into two parts, M1/M2, and
+/// splitStaticCtorDtor - A module was recently split into two parts, M1/M2, and
/// M1 has all of the global variables. If M2 contains any functions that are
/// static ctors/dtors, we need to add an llvm.global_[cd]tors global to M2, and
/// prune appropriate entries out of M1s list.
-static void SplitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
+static void splitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
ValueToValueMapTy &VMap) {
GlobalVariable *GV = M1->getNamedGlobal(GlobalName);
if (!GV || GV->isDeclaration() || GV->hasLocalLinkage() || !GV->use_empty())
@@ -284,7 +278,7 @@ static void SplitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
GV->eraseFromParent();
if (!M1Tors.empty()) {
- Constant *M1Init = GetTorInit(M1Tors);
+ Constant *M1Init = getTorInit(M1Tors);
new GlobalVariable(*M1, M1Init->getType(), false,
GlobalValue::AppendingLinkage, M1Init, GlobalName);
}
@@ -295,14 +289,14 @@ static void SplitStaticCtorDtor(const char *GlobalName, Module *M1, Module *M2,
GV->eraseFromParent();
if (!M2Tors.empty()) {
- Constant *M2Init = GetTorInit(M2Tors);
+ Constant *M2Init = getTorInit(M2Tors);
new GlobalVariable(*M2, M2Init->getType(), false,
GlobalValue::AppendingLinkage, M2Init, GlobalName);
}
}
std::unique_ptr<Module>
-llvm::SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
+llvm::splitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
ValueToValueMapTy &VMap) {
// Make sure functions & globals are all external so that linkage
// between the two modules will work.
@@ -326,13 +320,13 @@ llvm::SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
LLVM_DEBUG(TNOF->printAsOperand(errs(), false));
LLVM_DEBUG(errs() << "\n");
TestFunctions.insert(cast<Function>(NewVMap[TNOF]));
- DeleteFunctionBody(TNOF); // Function is now external in this module!
+ deleteFunctionBody(TNOF); // Function is now external in this module!
}
// Remove the Safe functions from the Test module
for (Function &I : *New)
if (!TestFunctions.count(&I))
- DeleteFunctionBody(&I);
+ deleteFunctionBody(&I);
// Try to split the global initializers evenly
for (GlobalVariable &I : M->globals()) {
@@ -348,17 +342,17 @@ llvm::SplitFunctionsOutOfModule(Module *M, const std::vector<Function *> &F,
<< TestFn->getName() << "'.\n";
exit(1);
}
- DeleteGlobalInitializer(&I); // Delete the initializer to make it external
+ deleteGlobalInitializer(&I); // Delete the initializer to make it external
} else {
// If we keep it in the safe module, then delete it in the test module
- DeleteGlobalInitializer(GV);
+ deleteGlobalInitializer(GV);
}
}
// Make sure that there is a global ctor/dtor array in both halves of the
// module if they both have static ctor/dtor functions.
- SplitStaticCtorDtor("llvm.global_ctors", M, New.get(), NewVMap);
- SplitStaticCtorDtor("llvm.global_dtors", M, New.get(), NewVMap);
+ splitStaticCtorDtor("llvm.global_ctors", M, New.get(), NewVMap);
+ splitStaticCtorDtor("llvm.global_dtors", M, New.get(), NewVMap);
return New;
}
@@ -375,7 +369,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
outs() << "*** Basic Block extraction failed!\n";
errs() << "Error creating temporary file: " << toString(Temp.takeError())
<< "\n";
- EmitProgressBitcode(*M, "basicblockextractfail", true);
+ emitProgressBitcode(*M, "basicblockextractfail", true);
return nullptr;
}
DiscardTemp Discard{*Temp};
@@ -399,7 +393,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
OS.flush();
if (OS.has_error()) {
errs() << "Error writing list of blocks to not extract\n";
- EmitProgressBitcode(*M, "basicblockextractfail", true);
+ emitProgressBitcode(*M, "basicblockextractfail", true);
OS.clear_error();
return nullptr;
}
@@ -413,7 +407,7 @@ BugDriver::extractMappedBlocksFromModule(const std::vector<BasicBlock *> &BBs,
if (!Ret) {
outs() << "*** Basic Block extraction failed, please report a bug!\n";
- EmitProgressBitcode(*M, "basicblockextractfail", true);
+ emitProgressBitcode(*M, "basicblockextractfail", true);
}
return Ret;
}
diff --git a/llvm/tools/bugpoint/Miscompilation.cpp b/llvm/tools/bugpoint/Miscompilation.cpp
index 4cf7de3..a7f1643 100644
--- a/llvm/tools/bugpoint/Miscompilation.cpp
+++ b/llvm/tools/bugpoint/Miscompilation.cpp
@@ -33,16 +33,16 @@ extern cl::opt<std::string> OutputPrefix;
extern cl::list<std::string> InputArgv;
} // end namespace llvm
-namespace {
-static llvm::cl::opt<bool> DisableLoopExtraction(
+static cl::opt<bool> DisableLoopExtraction(
"disable-loop-extraction",
cl::desc("Don't extract loops when searching for miscompilations"),
cl::init(false));
-static llvm::cl::opt<bool> DisableBlockExtraction(
+static cl::opt<bool> DisableBlockExtraction(
"disable-block-extraction",
cl::desc("Don't extract blocks when searching for miscompilations"),
cl::init(false));
+namespace {
class ReduceMiscompilingPasses : public ListReducer<std::string> {
BugDriver &BD;
@@ -71,7 +71,7 @@ ReduceMiscompilingPasses::doTest(std::vector<std::string> &Prefix,
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
BD.setPassesToRun(Suffix);
- BD.EmitProgressBitcode(BD.getProgram(), "pass-error", false);
+ BD.emitProgressBitcode(BD.getProgram(), "pass-error", false);
// TODO: This should propagate the error instead of exiting.
if (Error E = BD.debugOptimizerCrash())
exit(1);
@@ -113,7 +113,7 @@ ReduceMiscompilingPasses::doTest(std::vector<std::string> &Prefix,
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
BD.setPassesToRun(Prefix);
- BD.EmitProgressBitcode(BD.getProgram(), "pass-error", false);
+ BD.emitProgressBitcode(BD.getProgram(), "pass-error", false);
// TODO: This should propagate the error instead of exiting.
if (Error E = BD.debugOptimizerCrash())
exit(1);
@@ -158,7 +158,7 @@ ReduceMiscompilingPasses::doTest(std::vector<std::string> &Prefix,
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
BD.setPassesToRun(Suffix);
- BD.EmitProgressBitcode(BD.getProgram(), "pass-error", false);
+ BD.emitProgressBitcode(BD.getProgram(), "pass-error", false);
// TODO: This should propagate the error instead of exiting.
if (Error E = BD.debugOptimizerCrash())
exit(1);
@@ -253,7 +253,7 @@ ReduceMiscompilingFunctions::TestFuncs(const std::vector<Function *> &Funcs) {
<< (Funcs.size() == 1 ? "this function is" : "these functions are")
<< " run through the pass"
<< (BD.getPassesToRun().size() == 1 ? "" : "es") << ":";
- PrintFunctionList(Funcs);
+ printFunctionList(Funcs);
outs() << '\n';
// Create a clone for two reasons:
@@ -277,7 +277,7 @@ ReduceMiscompilingFunctions::TestFuncs(const std::vector<Function *> &Funcs) {
VMap.clear();
std::unique_ptr<Module> ToNotOptimize = CloneModule(BD.getProgram(), VMap);
std::unique_ptr<Module> ToOptimize =
- SplitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
+ splitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
Expected<bool> Broken =
TestFn(BD, std::move(ToOptimize), std::move(ToNotOptimize));
@@ -314,7 +314,7 @@ ExtractLoops(BugDriver &BD,
ValueToValueMapTy VMap;
std::unique_ptr<Module> ToNotOptimize = CloneModule(BD.getProgram(), VMap);
- std::unique_ptr<Module> ToOptimize = SplitFunctionsOutOfModule(
+ std::unique_ptr<Module> ToOptimize = splitFunctionsOutOfModule(
ToNotOptimize.get(), MiscompiledFunctions, VMap);
std::unique_ptr<Module> ToOptimizeLoopExtracted =
BD.extractLoop(ToOptimize.get());
@@ -517,7 +517,7 @@ ReduceMiscompiledBlocks::TestFuncs(const std::vector<BasicBlock *> &BBs) {
std::unique_ptr<Module> ToNotOptimize = CloneModule(BD.getProgram(), VMap);
std::unique_ptr<Module> ToOptimize =
- SplitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
+ splitFunctionsOutOfModule(ToNotOptimize.get(), FuncsOnClone, VMap);
// Try the extraction. If it doesn't work, then the block extractor crashed
// or something, in which case bugpoint can't chase down this possibility.
@@ -572,7 +572,7 @@ ExtractBlocks(BugDriver &BD,
ValueToValueMapTy VMap;
std::unique_ptr<Module> ProgClone = CloneModule(BD.getProgram(), VMap);
std::unique_ptr<Module> ToExtract =
- SplitFunctionsOutOfModule(ProgClone.get(), MiscompiledFunctions, VMap);
+ splitFunctionsOutOfModule(ProgClone.get(), MiscompiledFunctions, VMap);
std::unique_ptr<Module> Extracted =
BD.extractMappedBlocksFromModule(Blocks, ToExtract.get());
if (!Extracted) {
@@ -638,7 +638,7 @@ static Expected<std::vector<Function *>> DebugAMiscompilation(
outs() << "\n*** The following function"
<< (MiscompiledFunctions.size() == 1 ? " is" : "s are")
<< " being miscompiled: ";
- PrintFunctionList(MiscompiledFunctions);
+ printFunctionList(MiscompiledFunctions);
outs() << '\n';
// See if we can rip any loops out of the miscompiled functions and still
@@ -663,7 +663,7 @@ static Expected<std::vector<Function *>> DebugAMiscompilation(
outs() << "\n*** The following function"
<< (MiscompiledFunctions.size() == 1 ? " is" : "s are")
<< " being miscompiled: ";
- PrintFunctionList(MiscompiledFunctions);
+ printFunctionList(MiscompiledFunctions);
outs() << '\n';
}
}
@@ -686,7 +686,7 @@ static Expected<std::vector<Function *>> DebugAMiscompilation(
outs() << "\n*** The following function"
<< (MiscompiledFunctions.size() == 1 ? " is" : "s are")
<< " being miscompiled: ";
- PrintFunctionList(MiscompiledFunctions);
+ printFunctionList(MiscompiledFunctions);
outs() << '\n';
}
}
@@ -708,7 +708,7 @@ static Expected<bool> TestOptimizer(BugDriver &BD, std::unique_ptr<Module> Test,
if (!Optimized) {
errs() << " Error running this sequence of passes"
<< " on the input program!\n";
- BD.EmitProgressBitcode(*Test, "pass-error", false);
+ BD.emitProgressBitcode(*Test, "pass-error", false);
BD.setNewProgram(std::move(Test));
if (Error E = BD.debugOptimizerCrash())
return std::move(E);
@@ -750,7 +750,7 @@ Error BugDriver::debugMiscompilation() {
outs() << "\n*** Found miscompiling pass"
<< (getPassesToRun().size() == 1 ? "" : "es") << ": "
<< getPassesString(getPassesToRun()) << '\n';
- EmitProgressBitcode(*Program, "passinput");
+ emitProgressBitcode(*Program, "passinput");
Expected<std::vector<Function *>> MiscompiledFunctions =
DebugAMiscompilation(*this, TestOptimizer);
@@ -762,15 +762,15 @@ Error BugDriver::debugMiscompilation() {
ValueToValueMapTy VMap;
Module *ToNotOptimize = CloneModule(getProgram(), VMap).release();
Module *ToOptimize =
- SplitFunctionsOutOfModule(ToNotOptimize, *MiscompiledFunctions, VMap)
+ splitFunctionsOutOfModule(ToNotOptimize, *MiscompiledFunctions, VMap)
.release();
outs() << " Non-optimized portion: ";
- EmitProgressBitcode(*ToNotOptimize, "tonotoptimize", true);
+ emitProgressBitcode(*ToNotOptimize, "tonotoptimize", true);
delete ToNotOptimize; // Delete hacked module.
outs() << " Portion that is input to optimizer: ";
- EmitProgressBitcode(*ToOptimize, "tooptimize");
+ emitProgressBitcode(*ToOptimize, "tooptimize");
delete ToOptimize; // Delete hacked module.
return Error::success();
@@ -1028,7 +1028,7 @@ Error BugDriver::debugCodeGenerator() {
ValueToValueMapTy VMap;
std::unique_ptr<Module> ToNotCodeGen = CloneModule(getProgram(), VMap);
std::unique_ptr<Module> ToCodeGen =
- SplitFunctionsOutOfModule(ToNotCodeGen.get(), *Funcs, VMap);
+ splitFunctionsOutOfModule(ToNotCodeGen.get(), *Funcs, VMap);
// Condition the modules
ToCodeGen =
diff --git a/llvm/tools/bugpoint/OptimizerDriver.cpp b/llvm/tools/bugpoint/OptimizerDriver.cpp
index 3daacfd..bf2e8c0 100644
--- a/llvm/tools/bugpoint/OptimizerDriver.cpp
+++ b/llvm/tools/bugpoint/OptimizerDriver.cpp
@@ -82,7 +82,7 @@ bool BugDriver::writeProgramToFile(const std::string &Filename,
/// This function is used to output the current Program to a file named
/// "bugpoint-ID.bc".
-void BugDriver::EmitProgressBitcode(const Module &M, const std::string &ID,
+void BugDriver::emitProgressBitcode(const Module &M, const std::string &ID,
bool NoFlyer) const {
// Output the input to the current pass to a bitcode file, emit a message
// telling the user how to reproduce it: opt -foo blah.bc
diff --git a/llvm/tools/bugpoint/ToolRunner.cpp b/llvm/tools/bugpoint/ToolRunner.cpp
index f2f5966a..c67695f 100644
--- a/llvm/tools/bugpoint/ToolRunner.cpp
+++ b/llvm/tools/bugpoint/ToolRunner.cpp
@@ -25,29 +25,25 @@ using namespace llvm;
#define DEBUG_TYPE "toolrunner"
-namespace llvm {
-cl::opt<bool> SaveTemps("save-temps", cl::init(false),
- cl::desc("Save temporary files"));
-}
+cl::opt<bool> llvm::SaveTemps("save-temps", cl::init(false),
+ cl::desc("Save temporary files"));
-namespace {
-cl::opt<std::string>
+static cl::opt<std::string>
RemoteClient("remote-client",
cl::desc("Remote execution client (rsh/ssh)"));
-cl::opt<std::string> RemoteHost("remote-host",
- cl::desc("Remote execution (rsh/ssh) host"));
+static cl::opt<std::string>
+ RemoteHost("remote-host", cl::desc("Remote execution (rsh/ssh) host"));
-cl::opt<std::string> RemotePort("remote-port",
- cl::desc("Remote execution (rsh/ssh) port"));
+static cl::opt<std::string>
+ RemotePort("remote-port", cl::desc("Remote execution (rsh/ssh) port"));
-cl::opt<std::string> RemoteUser("remote-user",
- cl::desc("Remote execution (rsh/ssh) user id"));
+static cl::opt<std::string>
+ RemoteUser("remote-user", cl::desc("Remote execution (rsh/ssh) user id"));
-cl::opt<std::string>
+static cl::opt<std::string>
RemoteExtra("remote-extra-options",
cl::desc("Remote execution (rsh/ssh) extra options"));
-}
/// RunProgramWithTimeout - This function provides an alternate interface
/// to the sys::Program::ExecuteAndWait interface.
@@ -160,7 +156,7 @@ public:
const std::vector<std::string> &SharedLibs = std::vector<std::string>(),
unsigned Timeout = 0, unsigned MemoryLimit = 0) override;
};
-}
+} // namespace
Expected<int> LLI::ExecuteProgram(const std::string &Bitcode,
const std::vector<std::string> &Args,
@@ -258,7 +254,7 @@ public:
inconvertibleErrorCode());
}
};
-}
+} // namespace
Error CustomCompiler::compileProgram(const std::string &Bitcode,
unsigned Timeout, unsigned MemoryLimit) {
@@ -301,7 +297,7 @@ public:
const std::vector<std::string> &SharedLibs = std::vector<std::string>(),
unsigned Timeout = 0, unsigned MemoryLimit = 0) override;
};
-}
+} // namespace
Expected<int> CustomExecutor::ExecuteProgram(
const std::string &Bitcode, const std::vector<std::string> &Args,
@@ -541,7 +537,7 @@ public:
const std::vector<std::string> &SharedLibs = std::vector<std::string>(),
unsigned Timeout = 0, unsigned MemoryLimit = 0) override;
};
-}
+} // namespace
Expected<int> JIT::ExecuteProgram(const std::string &Bitcode,
const std::vector<std::string> &Args,
diff --git a/llvm/tools/bugpoint/bugpoint.cpp b/llvm/tools/bugpoint/bugpoint.cpp
index 87581e80a..52ed135 100644
--- a/llvm/tools/bugpoint/bugpoint.cpp
+++ b/llvm/tools/bugpoint/bugpoint.cpp
@@ -90,7 +90,7 @@ public:
D.addPass(std::string(PI->getPassArgument()));
}
};
-}
+} // namespace
#define HANDLE_EXTENSION(Ext) \
llvm::PassPluginLibraryInfo get##Ext##PluginInfo();
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 731d648..b7f898f 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -623,8 +623,9 @@ public:
});
}
- char *prepare(ExecutorAddr Addr, size_t ContentSize) override {
- return InProcessMemoryMapper::prepare(Addr - DeltaAddr, ContentSize);
+ char *prepare(jitlink::LinkGraph &G, ExecutorAddr Addr,
+ size_t ContentSize) override {
+ return InProcessMemoryMapper::prepare(G, Addr - DeltaAddr, ContentSize);
}
void initialize(AllocInfo &AI, OnInitializedFunction OnInitialized) override {
diff --git a/llvm/unittests/ADT/BitFieldsTest.cpp b/llvm/unittests/ADT/BitFieldsTest.cpp
index 3062d5d..ae541fe 100644
--- a/llvm/unittests/ADT/BitFieldsTest.cpp
+++ b/llvm/unittests/ADT/BitFieldsTest.cpp
@@ -247,8 +247,8 @@ TEST(BitfieldsTest, ValueTooBigBounded) {
Bitfield::set<A>(Storage, 0);
Bitfield::set<A>(Storage, -1);
Bitfield::set<A>(Storage, -2);
- EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, 2), "value is too big");
- EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, -3), "value is too small");
+ EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, 2), "value is out of range");
+ EXPECT_DEBUG_DEATH(Bitfield::set<A>(Storage, -3), "value is out of range");
}
#endif
diff --git a/llvm/unittests/ADT/StringExtrasTest.cpp b/llvm/unittests/ADT/StringExtrasTest.cpp
index fbaed38..af88f889 100644
--- a/llvm/unittests/ADT/StringExtrasTest.cpp
+++ b/llvm/unittests/ADT/StringExtrasTest.cpp
@@ -290,6 +290,12 @@ TEST(StringExtrasTest, ListSeparator) {
EXPECT_EQ(S, "");
S = LS2;
EXPECT_EQ(S, " ");
+
+ ListSeparator LS3(",", "{");
+ S = LS3;
+ EXPECT_EQ(S, "{");
+ S = LS3;
+ EXPECT_EQ(S, ",");
}
TEST(StringExtrasTest, toStringAPInt) {
diff --git a/llvm/unittests/ADT/StringSwitchTest.cpp b/llvm/unittests/ADT/StringSwitchTest.cpp
index bcb1521..0fbf371 100644
--- a/llvm/unittests/ADT/StringSwitchTest.cpp
+++ b/llvm/unittests/ADT/StringSwitchTest.cpp
@@ -153,13 +153,14 @@ TEST(StringSwitchTest, EndsWithLower) {
}
TEST(StringSwitchTest, Cases) {
- enum class OSType { Windows, Linux, Unknown };
+ enum class OSType { Windows, Linux, MacOS, Unknown };
auto Translate = [](StringRef S) {
return llvm::StringSwitch<OSType>(S)
.Cases(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt",
OSType::Windows)
.Cases("linux", "unix", "*nix", "posix", OSType::Linux)
+ .Cases({"macos", "osx"}, OSType::MacOS)
.Default(OSType::Unknown);
};
@@ -172,21 +173,26 @@ TEST(StringSwitchTest, Cases) {
EXPECT_EQ(OSType::Linux, Translate("*nix"));
EXPECT_EQ(OSType::Linux, Translate("posix"));
+ EXPECT_EQ(OSType::MacOS, Translate("macos"));
+ EXPECT_EQ(OSType::MacOS, Translate("osx"));
+
// Note that the whole null-terminator embedded string is required for the
// case to match.
EXPECT_EQ(OSType::Unknown, Translate("wind"));
EXPECT_EQ(OSType::Unknown, Translate("Windows"));
+ EXPECT_EQ(OSType::Unknown, Translate("MacOS"));
EXPECT_EQ(OSType::Unknown, Translate(""));
}
TEST(StringSwitchTest, CasesLower) {
- enum class OSType { Windows, Linux, Unknown };
+ enum class OSType { Windows, Linux, MacOS, Unknown };
auto Translate = [](StringRef S) {
return llvm::StringSwitch<OSType>(S)
.CasesLower(StringLiteral::withInnerNUL("wind\0ws"), "win32", "winnt",
OSType::Windows)
.CasesLower("linux", "unix", "*nix", "posix", OSType::Linux)
+ .CasesLower({"macos", "osx"}, OSType::MacOS)
.Default(OSType::Unknown);
};
@@ -202,6 +208,9 @@ TEST(StringSwitchTest, CasesLower) {
EXPECT_EQ(OSType::Windows, Translate(llvm::StringRef("wind\0ws", 7)));
EXPECT_EQ(OSType::Linux, Translate("linux"));
+ EXPECT_EQ(OSType::MacOS, Translate("macOS"));
+ EXPECT_EQ(OSType::MacOS, Translate("OSX"));
+
EXPECT_EQ(OSType::Unknown, Translate("wind"));
EXPECT_EQ(OSType::Unknown, Translate(""));
}
diff --git a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
index 1a68823..5d7eded 100644
--- a/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/llvm/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -11,6 +11,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
+#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/Constants.h"
@@ -26,6 +27,8 @@
namespace llvm {
+using namespace SCEVPatternMatch;
+
// We use this fixture to ensure that we clean up ScalarEvolution before
// deleting the PassManager.
class ScalarEvolutionsTest : public testing::Test {
@@ -64,11 +67,6 @@ static std::optional<APInt> computeConstantDifference(ScalarEvolution &SE,
return SE.computeConstantDifference(LHS, RHS);
}
- static bool matchURem(ScalarEvolution &SE, const SCEV *Expr, const SCEV *&LHS,
- const SCEV *&RHS) {
- return SE.matchURem(Expr, LHS, RHS);
- }
-
static bool isImpliedCond(
ScalarEvolution &SE, ICmpInst::Predicate Pred, const SCEV *LHS,
const SCEV *RHS, ICmpInst::Predicate FoundPred, const SCEV *FoundLHS,
@@ -1524,7 +1522,7 @@ TEST_F(ScalarEvolutionsTest, MatchURem) {
auto *URemI = getInstructionByName(F, N);
auto *S = SE.getSCEV(URemI);
const SCEV *LHS, *RHS;
- EXPECT_TRUE(matchURem(SE, S, LHS, RHS));
+ EXPECT_TRUE(match(S, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), SE)));
EXPECT_EQ(LHS, SE.getSCEV(URemI->getOperand(0)));
EXPECT_EQ(RHS, SE.getSCEV(URemI->getOperand(1)));
EXPECT_EQ(LHS->getType(), S->getType());
@@ -1537,7 +1535,7 @@ TEST_F(ScalarEvolutionsTest, MatchURem) {
auto *URem1 = getInstructionByName(F, "rem4");
auto *S = SE.getSCEV(Ext);
const SCEV *LHS, *RHS;
- EXPECT_TRUE(matchURem(SE, S, LHS, RHS));
+ EXPECT_TRUE(match(S, m_scev_URem(m_SCEV(LHS), m_SCEV(RHS), SE)));
EXPECT_NE(LHS, SE.getSCEV(URem1->getOperand(0)));
// RHS and URem1->getOperand(1) have different widths, so compare the
// integer values.
diff --git a/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp
index c5e9d43..a5269f7 100644
--- a/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/MapperJITLinkMemoryManagerTest.cpp
@@ -39,8 +39,8 @@ public:
return Mapper->initialize(AI, std::move(OnInitialized));
}
- char *prepare(ExecutorAddr Addr, size_t ContentSize) override {
- return Mapper->prepare(Addr, ContentSize);
+ char *prepare(LinkGraph &G, ExecutorAddr Addr, size_t ContentSize) override {
+ return Mapper->prepare(G, Addr, ContentSize);
}
void deinitialize(ArrayRef<ExecutorAddr> Allocations,
diff --git a/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp b/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp
index fea9eab..1174493 100644
--- a/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/MemoryMapperTest.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/MemoryMapper.h"
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/Support/Process.h"
#include "llvm/Testing/Support/Error.h"
#include "gtest/gtest.h"
@@ -66,6 +67,9 @@ TEST(MemoryMapperTest, InitializeDeinitialize) {
{
std::unique_ptr<MemoryMapper> Mapper =
cantFail(InProcessMemoryMapper::Create());
+ jitlink::LinkGraph G("G", std::make_shared<SymbolStringPool>(),
+ Triple("x86_64-apple-darwin"), SubtargetFeatures(),
+ jitlink::getGenericEdgeKindName);
// We will do two separate allocations
auto PageSize = Mapper->getPageSize();
@@ -80,7 +84,7 @@ TEST(MemoryMapperTest, InitializeDeinitialize) {
{
// Provide working memory
- char *WA1 = Mapper->prepare(Mem1->Start, HW.size() + 1);
+ char *WA1 = Mapper->prepare(G, Mem1->Start, HW.size() + 1);
std::strcpy(WA1, HW.c_str());
}
@@ -105,7 +109,7 @@ TEST(MemoryMapperTest, InitializeDeinitialize) {
}
{
- char *WA2 = Mapper->prepare(Mem1->Start + PageSize, HW.size() + 1);
+ char *WA2 = Mapper->prepare(G, Mem1->Start + PageSize, HW.size() + 1);
std::strcpy(WA2, HW.c_str());
}
@@ -158,7 +162,7 @@ TEST(MemoryMapperTest, InitializeDeinitialize) {
auto Mem2 = reserve(*Mapper, PageSize);
EXPECT_THAT_ERROR(Mem2.takeError(), Succeeded());
- char *WA = Mapper->prepare(Mem2->Start, HW.size() + 1);
+ char *WA = Mapper->prepare(G, Mem2->Start, HW.size() + 1);
std::strcpy(WA, HW.c_str());
MemoryMapper::AllocInfo Alloc3;
diff --git a/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp b/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp
index 700500f..7775f3c 100644
--- a/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/SharedMemoryMapperTest.cpp
@@ -8,6 +8,7 @@
#include "OrcTestCommon.h"
#include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/ExecutionEngine/Orc/MemoryMapper.h"
#include "llvm/ExecutionEngine/Orc/SelfExecutorProcessControl.h"
#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
@@ -67,12 +68,16 @@ TEST(SharedMemoryMapperTest, MemReserveInitializeDeinitializeRelease) {
auto PageSize = Mapper->getPageSize();
size_t ReqSize = PageSize;
+ jitlink::LinkGraph G("G", std::make_shared<SymbolStringPool>(),
+ Triple("x86_64-apple-darwin"), SubtargetFeatures(),
+ jitlink::getGenericEdgeKindName);
Mapper->reserve(ReqSize, [&](Expected<ExecutorAddrRange> Result) {
EXPECT_THAT_ERROR(Result.takeError(), Succeeded());
auto Reservation = std::move(*Result);
{
- char *Addr = Mapper->prepare(Reservation.Start, TestString.size() + 1);
+ char *Addr =
+ Mapper->prepare(G, Reservation.Start, TestString.size() + 1);
std::strcpy(Addr, TestString.c_str());
}
MemoryMapper::AllocInfo AI;
diff --git a/llvm/unittests/IR/ConstantFPRangeTest.cpp b/llvm/unittests/IR/ConstantFPRangeTest.cpp
index cf9b31c..67fee96 100644
--- a/llvm/unittests/IR/ConstantFPRangeTest.cpp
+++ b/llvm/unittests/IR/ConstantFPRangeTest.cpp
@@ -8,6 +8,7 @@
#include "llvm/IR/ConstantFPRange.h"
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include "gtest/gtest.h"
@@ -1065,4 +1066,179 @@ TEST_F(ConstantFPRangeTest, sub) {
#endif
}
+TEST_F(ConstantFPRangeTest, mul) {
+ EXPECT_EQ(Full.mul(Full), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(Full.mul(Empty), Empty);
+ EXPECT_EQ(Empty.mul(Full), Empty);
+ EXPECT_EQ(Empty.mul(Empty), Empty);
+ EXPECT_EQ(One.mul(One), ConstantFPRange(APFloat(1.0)));
+ EXPECT_EQ(Some.mul(Some),
+ ConstantFPRange::getNonNaN(APFloat(-9.0), APFloat(9.0)));
+ EXPECT_EQ(SomePos.mul(SomeNeg),
+ ConstantFPRange::getNonNaN(APFloat(-9.0), APFloat(-0.0)));
+ EXPECT_EQ(PosInf.mul(PosInf), PosInf);
+ EXPECT_EQ(NegInf.mul(NegInf), PosInf);
+ EXPECT_EQ(PosInf.mul(Finite), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(NegInf.mul(Finite), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(PosInf.mul(NegInf), NegInf);
+ EXPECT_EQ(NegInf.mul(PosInf), NegInf);
+ EXPECT_EQ(PosZero.mul(NegZero), NegZero);
+ EXPECT_EQ(PosZero.mul(Zero), Zero);
+ EXPECT_EQ(NegZero.mul(NegZero), PosZero);
+ EXPECT_EQ(NegZero.mul(Zero), Zero);
+ EXPECT_EQ(NaN.mul(NaN), QNaN);
+ EXPECT_EQ(NaN.mul(Finite), QNaN);
+
+#if defined(EXPENSIVE_CHECKS)
+ EnumerateTwoInterestingConstantFPRanges(
+ [](const ConstantFPRange &LHS, const ConstantFPRange &RHS) {
+ ConstantFPRange Res = LHS.mul(RHS);
+ ConstantFPRange Expected =
+ ConstantFPRange::getEmpty(LHS.getSemantics());
+ EnumerateValuesInConstantFPRange(
+ LHS,
+ [&](const APFloat &LHSC) {
+ EnumerateValuesInConstantFPRange(
+ RHS,
+ [&](const APFloat &RHSC) {
+ APFloat Prod = LHSC * RHSC;
+ EXPECT_TRUE(Res.contains(Prod))
+ << "Wrong result for " << LHS << " * " << RHS
+ << ". The result " << Res << " should contain " << Prod;
+ if (!Expected.contains(Prod))
+ Expected = Expected.unionWith(ConstantFPRange(Prod));
+ },
+ /*IgnoreNaNPayload=*/true);
+ },
+ /*IgnoreNaNPayload=*/true);
+ EXPECT_EQ(Res, Expected)
+ << "Suboptimal result for " << LHS << " * " << RHS << ". Expected "
+ << Expected << ", but got " << Res;
+ },
+ SparseLevel::SpecialValuesOnly);
+#endif
+}
+
+TEST_F(ConstantFPRangeTest, div) {
+ EXPECT_EQ(Full.div(Full), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(Full.div(Empty), Empty);
+ EXPECT_EQ(Empty.div(Full), Empty);
+ EXPECT_EQ(Empty.div(Empty), Empty);
+ EXPECT_EQ(One.div(One), ConstantFPRange(APFloat(1.0)));
+ EXPECT_EQ(Some.div(Some), NonNaN.unionWith(QNaN));
+ EXPECT_EQ(SomePos.div(SomeNeg),
+ ConstantFPRange(APFloat::getInf(Sem, /*Negative=*/true),
+ APFloat::getZero(Sem, /*Negative=*/true),
+ /*MayBeQNaN=*/true, /*MayBeSNaN=*/false));
+ EXPECT_EQ(PosInf.div(PosInf), QNaN);
+ EXPECT_EQ(NegInf.div(NegInf), QNaN);
+ EXPECT_EQ(PosInf.div(Finite), NonNaN);
+ EXPECT_EQ(NegInf.div(Finite), NonNaN);
+ EXPECT_EQ(PosInf.div(NegInf), QNaN);
+ EXPECT_EQ(NegInf.div(PosInf), QNaN);
+ EXPECT_EQ(Zero.div(Zero), QNaN);
+ EXPECT_EQ(SomePos.div(PosInf), PosZero);
+ EXPECT_EQ(SomeNeg.div(PosInf), NegZero);
+ EXPECT_EQ(PosInf.div(SomePos), PosInf);
+ EXPECT_EQ(NegInf.div(SomeNeg), PosInf);
+ EXPECT_EQ(NegInf.div(Some), NonNaN);
+ EXPECT_EQ(NaN.div(NaN), QNaN);
+ EXPECT_EQ(NaN.div(Finite), QNaN);
+
+#if defined(EXPENSIVE_CHECKS)
+ EnumerateTwoInterestingConstantFPRanges(
+ [](const ConstantFPRange &LHS, const ConstantFPRange &RHS) {
+ ConstantFPRange Res = LHS.div(RHS);
+ ConstantFPRange Expected =
+ ConstantFPRange::getEmpty(LHS.getSemantics());
+ EnumerateValuesInConstantFPRange(
+ LHS,
+ [&](const APFloat &LHSC) {
+ EnumerateValuesInConstantFPRange(
+ RHS,
+ [&](const APFloat &RHSC) {
+ APFloat Val = LHSC / RHSC;
+ EXPECT_TRUE(Res.contains(Val))
+ << "Wrong result for " << LHS << " / " << RHS
+ << ". The result " << Res << " should contain " << Val;
+ if (!Expected.contains(Val))
+ Expected = Expected.unionWith(ConstantFPRange(Val));
+ },
+ /*IgnoreNaNPayload=*/true);
+ },
+ /*IgnoreNaNPayload=*/true);
+ EXPECT_EQ(Res, Expected)
+ << "Suboptimal result for " << LHS << " / " << RHS << ". Expected "
+ << Expected << ", but got " << Res;
+ },
+ SparseLevel::SpecialValuesOnly);
+#endif
+}
+
+TEST_F(ConstantFPRangeTest, flushDenormals) {
+ const fltSemantics &FP8Sem = APFloat::Float8E4M3();
+ APFloat NormalVal = APFloat::getSmallestNormalized(FP8Sem);
+ APFloat Subnormal1 = NormalVal;
+ Subnormal1.next(/*nextDown=*/true);
+ APFloat Subnormal2 = APFloat::getSmallest(FP8Sem);
+ APFloat ZeroVal = APFloat::getZero(FP8Sem);
+ APFloat EdgeValues[8] = {-NormalVal, -Subnormal1, -Subnormal2, -ZeroVal,
+ ZeroVal, Subnormal2, Subnormal1, NormalVal};
+ constexpr DenormalMode::DenormalModeKind Modes[4] = {
+ DenormalMode::IEEE, DenormalMode::PreserveSign,
+ DenormalMode::PositiveZero, DenormalMode::Dynamic};
+ for (uint32_t I = 0; I != 8; ++I) {
+ for (uint32_t J = I; J != 8; ++J) {
+ ConstantFPRange OriginCR =
+ ConstantFPRange::getNonNaN(EdgeValues[I], EdgeValues[J]);
+ for (auto Mode : Modes) {
+ StringRef ModeName = denormalModeKindName(Mode);
+ ConstantFPRange FlushedCR = OriginCR;
+ FlushedCR.flushDenormals(Mode);
+
+ ConstantFPRange Expected = ConstantFPRange::getEmpty(FP8Sem);
+ auto CheckFlushedV = [&](const APFloat &V, const APFloat &FlushedV) {
+ EXPECT_TRUE(FlushedCR.contains(FlushedV))
+ << "Wrong result for flushDenormal(" << V << ", " << ModeName
+ << "). The result " << FlushedCR << " should contain "
+ << FlushedV;
+ if (!Expected.contains(FlushedV))
+ Expected = Expected.unionWith(ConstantFPRange(FlushedV));
+ };
+ EnumerateValuesInConstantFPRange(
+ OriginCR,
+ [&](const APFloat &V) {
+ if (V.isDenormal()) {
+ switch (Mode) {
+ case DenormalMode::IEEE:
+ break;
+ case DenormalMode::PreserveSign:
+ CheckFlushedV(V, APFloat::getZero(FP8Sem, V.isNegative()));
+ break;
+ case DenormalMode::PositiveZero:
+ CheckFlushedV(V, APFloat::getZero(FP8Sem));
+ break;
+ case DenormalMode::Dynamic:
+ // PreserveSign
+ CheckFlushedV(V, APFloat::getZero(FP8Sem, V.isNegative()));
+ // PositiveZero
+ CheckFlushedV(V, APFloat::getZero(FP8Sem));
+ break;
+ default:
+ llvm_unreachable("unknown denormal mode");
+ }
+ }
+ // It is not mandated that flushing to zero occurs.
+ CheckFlushedV(V, V);
+ },
+ /*IgnoreNaNPayload=*/true);
+ EXPECT_EQ(FlushedCR, Expected)
+ << "Suboptimal result for flushDenormal(" << OriginCR << ", "
+ << ModeName << "). Expected " << Expected << ", but got "
+ << FlushedCR;
+ }
+ }
+ }
+}
+
} // anonymous namespace
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index fe9e7e8..f4693bf 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -606,12 +606,14 @@ TEST(InstructionTest, ConstrainedTrans) {
TEST(InstructionsTest, isEliminableCastPair) {
LLVMContext C;
- DataLayout DL1("p1:32:32");
+ DataLayout DL1("p1:32:32-p2:64:64:64:32");
Type *Int16Ty = Type::getInt16Ty(C);
+ Type *Int32Ty = Type::getInt32Ty(C);
Type *Int64Ty = Type::getInt64Ty(C);
Type *PtrTy64 = PointerType::get(C, 0);
Type *PtrTy32 = PointerType::get(C, 1);
+ Type *PtrTy64_32 = PointerType::get(C, 2);
// Source and destination pointers have same size -> bitcast.
EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::PtrToInt,
@@ -637,6 +639,42 @@ TEST(InstructionsTest, isEliminableCastPair) {
Int64Ty, &DL1),
0U);
+ // Destination larger than source. Pointer type same as destination.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int16Ty, PtrTy64,
+ Int64Ty, &DL1),
+ CastInst::ZExt);
+
+ // Destination larger than source. Pointer type different from destination.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int16Ty, PtrTy32,
+ Int64Ty, &DL1),
+ CastInst::ZExt);
+
+ // Destination smaller than source. Pointer type same as source.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int64Ty, PtrTy64,
+ Int16Ty, &DL1),
+ CastInst::Trunc);
+
+ // Destination smaller than source. Pointer type different from source.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt, Int64Ty, PtrTy32,
+ Int16Ty, &DL1),
+ CastInst::Trunc);
+
+ // ptrtoaddr with address size != pointer size. Truncating case.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToAddr, Int64Ty,
+ PtrTy64_32, Int32Ty, &DL1),
+ CastInst::Trunc);
+
+ // ptrtoaddr with address size != pointer size. Non-truncating case.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToAddr, Int32Ty,
+ PtrTy64_32, Int32Ty, &DL1),
+ CastInst::BitCast);
+
// Test that we don't eliminate bitcasts between different address spaces,
// or if we don't have available pointer size information.
DataLayout DL2("e-p:32:32:32-p1:16:16:16-p2:64:64:64-i1:8:8-i8:8:8-i16:16:16"
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
index 25efa00..21f10eb 100644
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -44,6 +44,7 @@ add_llvm_unittest(SupportTests
ExtensibleRTTITest.cpp
FileCollectorTest.cpp
FileOutputBufferTest.cpp
+ Format.cpp
FormatVariadicTest.cpp
FSUniqueIDTest.cpp
GenericDomTreeTest.cpp
diff --git a/llvm/unittests/Support/Format.cpp b/llvm/unittests/Support/Format.cpp
new file mode 100644
index 0000000..c4e421f
--- /dev/null
+++ b/llvm/unittests/Support/Format.cpp
@@ -0,0 +1,56 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Format.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+template <typename FormatTy>
+std::string printToString(unsigned MaxN, FormatTy &&Fmt) {
+ std::vector<char> Dst(MaxN + 2);
+ int N = Fmt.snprint(Dst.data(), Dst.size());
+ Dst.back() = 0;
+ return N < 0 ? "" : Dst.data();
+}
+
+template <typename Expected, typename Arg>
+constexpr bool checkDecayTypeEq(const Arg &arg) {
+ return std::is_same_v<detail::decay_if_c_char_array_t<Arg>, Expected>;
+}
+
+TEST(Format, DecayIfCCharArray) {
+ char Array[] = "Array";
+ const char ConstArray[] = "ConstArray";
+ char PtrBuf[] = "Ptr";
+ char *Ptr = PtrBuf;
+ const char *PtrToConst = "PtrToConst";
+
+ EXPECT_EQ(" Literal", printToString(20, format("%15s", "Literal")));
+ EXPECT_EQ(" Array", printToString(20, format("%15s", Array)));
+ EXPECT_EQ(" ConstArray", printToString(20, format("%15s", ConstArray)));
+ EXPECT_EQ(" Ptr", printToString(20, format("%15s", Ptr)));
+ EXPECT_EQ(" PtrToConst", printToString(20, format("%15s", PtrToConst)));
+
+ EXPECT_TRUE(checkDecayTypeEq<const char *>("Literal"));
+ EXPECT_TRUE(checkDecayTypeEq<const char *>(Array));
+ EXPECT_TRUE(checkDecayTypeEq<const char *>(ConstArray));
+ EXPECT_TRUE(checkDecayTypeEq<char *>(Ptr));
+ EXPECT_TRUE(checkDecayTypeEq<const char *>(PtrToConst));
+ EXPECT_TRUE(checkDecayTypeEq<char>(PtrToConst[0]));
+ EXPECT_TRUE(
+ checkDecayTypeEq<const char *>(static_cast<const char *>("Literal")));
+
+ wchar_t WCharArray[] = L"WCharArray";
+ EXPECT_TRUE(checkDecayTypeEq<wchar_t[11]>(WCharArray));
+ EXPECT_TRUE(checkDecayTypeEq<wchar_t>(WCharArray[0]));
+}
+
+} // namespace
diff --git a/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp b/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
index c74d157..5ac4c53 100644
--- a/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
+++ b/llvm/unittests/Target/AArch64/AArch64SelectionDAGTest.cpp
@@ -177,6 +177,57 @@ TEST_F(AArch64SelectionDAGTest, ComputeNumSignBits_VASHR) {
EXPECT_EQ(DAG->ComputeNumSignBits(Fr2), 5u);
}
+// Checks the result of ComputeNumSignBits for ISD::SUB nodes built from i8
+// constants and from a partially known value (a masked register).
+TEST_F(AArch64SelectionDAGTest, ComputeNumSignBits_SUB) {
+  SDLoc Loc;
+  EVT IntVT = EVT::getIntegerVT(Context, 8);
+  SDValue Zero = DAG->getConstant(0x00, Loc, IntVT);
+  SDValue One = DAG->getConstant(0x01, Loc, IntVT);
+  SDValue Five = DAG->getConstant(0x05, Loc, IntVT);
+  SDValue OneSignBit = DAG->getConstant(0x55, Loc, IntVT);
+  SDValue Unknown = DAG->getRegister(0, IntVT);
+  SDValue Mask = DAG->getConstant(0x1e, Loc, IntVT);
+  SDValue ThreeSignBits = DAG->getNode(ISD::AND, Loc, IntVT, Mask, Unknown);
+
+  // RHS early out:
+  //   LHS = 000????0 (ThreeSignBits)
+  //   RHS = 01010101 (OneSignBit)
+  SDValue SubRhsEarlyOut =
+      DAG->getNode(ISD::SUB, Loc, IntVT, ThreeSignBits, OneSignBit);
+  EXPECT_EQ(DAG->ComputeNumSignBits(SubRhsEarlyOut), 1u);
+
+  // Negation of 0:
+  //   0 - 00000000
+  SDValue NegZero = DAG->getNode(ISD::SUB, Loc, IntVT, Zero, Zero);
+  EXPECT_EQ(DAG->ComputeNumSignBits(NegZero), 8u);
+
+  // Negation of 1:
+  //   0 - 00000001
+  SDValue NegOne = DAG->getNode(ISD::SUB, Loc, IntVT, Zero, One);
+  EXPECT_EQ(DAG->ComputeNumSignBits(NegOne), 8u);
+
+  // Negation of 5:
+  //   0 - 00000101
+  SDValue NegFive = DAG->getNode(ISD::SUB, Loc, IntVT, Zero, Five);
+  EXPECT_EQ(DAG->ComputeNumSignBits(NegFive), 5u);
+
+  // Negation of a non-negative, partially known value:
+  //   0 - 000????0
+  SDValue NegNonNeg = DAG->getNode(ISD::SUB, Loc, IntVT, Zero, ThreeSignBits);
+  EXPECT_EQ(DAG->ComputeNumSignBits(NegNonNeg), 3u);
+
+  // LHS early out:
+  //   LHS = 01010101 (OneSignBit)
+  //   RHS = 000????0 (ThreeSignBits)
+  SDValue SubLhsEarlyOut =
+      DAG->getNode(ISD::SUB, Loc, IntVT, OneSignBit, ThreeSignBits);
+  EXPECT_EQ(DAG->ComputeNumSignBits(SubLhsEarlyOut), 1u);
+
+  // General case, neither operand is zero:
+  //   LHS = 000????0 (ThreeSignBits)
+  //   RHS = 00000101 (Five)
+  SDValue SubGeneral = DAG->getNode(ISD::SUB, Loc, IntVT, ThreeSignBits, Five);
+  EXPECT_EQ(DAG->ComputeNumSignBits(SubGeneral), 2u);
+}
+
TEST_F(AArch64SelectionDAGTest, SimplifyDemandedVectorElts_EXTRACT_SUBVECTOR) {
TargetLowering TL(*TM);
diff --git a/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp b/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
index 841f44c..716f5f2 100644
--- a/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
+++ b/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
@@ -308,3 +308,223 @@ TEST(SSAUpdaterBulk, TwoBBLoop) {
EXPECT_EQ(Phi->getIncomingValueForBlock(Entry), ConstantInt::get(I32Ty, 0));
EXPECT_EQ(Phi->getIncomingValueForBlock(Loop), I);
}
+
+// Registers two variables with SSAUpdaterBulk whose uses are the operands of
+// %cmp, then checks that RewriteAndOptimizeAllUses leaves the exit block the
+// same size and rewrites the operands to %val and the existing %phi.
+TEST(SSAUpdaterBulk, SimplifyPHIs) {
+  const char *Source = R"(
+ define void @main(i32 %val, i1 %cond) {
+ entry:
+ br i1 %cond, label %left, label %right
+ left:
+ %add = add i32 %val, 1
+ br label %exit
+ right:
+ %sub = sub i32 %val, 1
+ br label %exit
+ exit:
+ %phi = phi i32 [ %sub, %right ], [ %add, %left ]
+ %cmp = icmp slt i32 0, 42
+ ret void
+ }
+ )";
+
+  LLVMContext Context;
+  SMDiagnostic Err;
+  std::unique_ptr<Module> M = parseAssemblyString(Source, Err, Context);
+  ASSERT_NE(M, nullptr) << "Failed to parse IR: " << Err.getMessage();
+
+  // Look up the blocks and instructions of interest by position.
+  Function *F = M->getFunction("main");
+  BasicBlock *EntryBB = &F->getEntryBlock();
+  BasicBlock *LeftBB = EntryBB->getTerminator()->getSuccessor(0);
+  BasicBlock *RightBB = EntryBB->getTerminator()->getSuccessor(1);
+  BasicBlock *ExitBB = LeftBB->getSingleSuccessor();
+  Argument *ValArg = &*F->arg_begin();
+  Instruction *PhiI = &ExitBB->front();
+  Instruction *CmpI = &*std::next(ExitBB->begin());
+  Instruction *AddI = &LeftBB->front();
+  Instruction *SubI = &RightBB->front();
+
+  SSAUpdaterBulk Updater;
+  Type *I32Ty = Type::getInt32Ty(Context);
+
+  // First variable: %val is the available value in both predecessors, so it
+  // should be used directly rather than through a new phi.
+  unsigned ValVar = Updater.AddVariable("Val", I32Ty);
+  Updater.AddAvailableValue(ValVar, LeftBB, ValArg);
+  Updater.AddAvailableValue(ValVar, RightBB, ValArg);
+  Updater.AddUse(ValVar, &CmpI->getOperandUse(0));
+
+  // Second variable: %add / %sub are the available values, which matches the
+  // incoming values of the existing %phi.
+  unsigned AddSubVar = Updater.AddVariable("AddSub", I32Ty);
+  Updater.AddAvailableValue(AddSubVar, LeftBB, AddI);
+  Updater.AddAvailableValue(AddSubVar, RightBB, SubI);
+  Updater.AddUse(AddSubVar, &CmpI->getOperandUse(1));
+
+  const size_t ExitSizeBefore = ExitBB->size();
+  DominatorTree DT(*F);
+  Updater.RewriteAndOptimizeAllUses(DT);
+
+  // Expected exit block afterwards (as printed by ExitBB->dump()):
+  //   exit:                             ; preds = %right, %left
+  //     %phi = phi i32 [ %sub, %right ], [ %add, %left ]
+  //     %cmp = icmp slt i32 %val, %phi
+  //     ret void
+
+  ASSERT_EQ(ExitBB->size(), ExitSizeBefore);
+  ASSERT_EQ(&ExitBB->front(), PhiI);
+  EXPECT_EQ(ValArg, CmpI->getOperand(0));
+  EXPECT_EQ(PhiI, CmpI->getOperand(1));
+}
+
+// Forward declaration of the routine exercised by the
+// EliminateNewDuplicatePHINodes_* tests below. Its definition is not visible
+// in this test file; presumably it lives in the SSAUpdaterBulk implementation
+// (TODO confirm).
+bool EliminateNewDuplicatePHINodes(BasicBlock *BB,
+ BasicBlock::phi_iterator FirstExistingPN);
+
+// Parses \p AsmText, finds the block named "testbb" in function "main", and
+// invokes \p Check with that block and EliminateNewDuplicatePHINodes.
+// Fails the current test if the IR does not parse, or if "main" or "testbb"
+// cannot be found.
+static void RunEliminateNewDuplicatePHINode(
+    const char *AsmText,
+    std::function<void(BasicBlock &,
+                       bool(BasicBlock *BB, BasicBlock::phi_iterator))>
+        Check) {
+  LLVMContext C;
+
+  SMDiagnostic Err;
+  std::unique_ptr<Module> M = parseAssemblyString(AsmText, Err, C);
+  if (!M) {
+    // Print the diagnostic and fail. Returning silently here would let a test
+    // with malformed IR pass without running any of its checks.
+    Err.print("UtilsTests", errs());
+    FAIL() << "Failed to parse test IR";
+  }
+
+  Function *F = M->getFunction("main");
+  ASSERT_NE(F, nullptr) << "Test IR must define @main";
+  auto BBIt = std::find_if(F->begin(), F->end(), [](const BasicBlock &Block) {
+    return Block.getName() == "testbb";
+  });
+  ASSERT_NE(BBIt, F->end());
+  Check(*BBIt, EliminateNewDuplicatePHINodes);
+}
+
+// Returns an iterator to the PHI at position \p Idx (0-based) in \p BB.
+static BasicBlock::phi_iterator getPhiIt(BasicBlock &BB, unsigned Idx) {
+  BasicBlock::phi_iterator It = BB.phis().begin();
+  std::advance(It, Idx);
+  return It;
+}
+
+// Returns a pointer to the PHI at position \p Idx (0-based) in \p BB.
+static PHINode *getPhi(BasicBlock &BB, unsigned Idx) {
+  PHINode &Phi = *getPhiIt(BB, Idx);
+  return &Phi;
+}
+
+// Counts the PHI nodes in \p BB.
+static int getNumPHIs(BasicBlock &BB) {
+  int Count = 0;
+  for (PHINode &Phi : BB.phis()) {
+    (void)Phi;
+    ++Count;
+  }
+  return Count;
+}
+
+// All four PHIs are identical; the PHI at index 2 is passed as the first
+// existing one. Expects both existing PHIs to remain and the add to use %ep0
+// for both operands.
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_OrderExisting) {
+  RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ]
+ %np1 = phi i32 [ 1, %entry ]
+ %ep0 = phi i32 [ 1, %entry ]
+ %ep1 = phi i32 [ 1, %entry ]
+ %u = add i32 %np0, %np1
+ ret void
+ }
+ )", [](BasicBlock &BB, auto *Eliminate) {
+    AssertingVH<PHINode> ExistingPhi0 = getPhi(BB, 2);
+    AssertingVH<PHINode> ExistingPhi1 = getPhi(BB, 3);
+    EXPECT_TRUE(Eliminate(&BB, getPhiIt(BB, 2)));
+    // Block contents expected after the call:
+    //   %ep0 = phi i32 [ 1, %entry ]
+    //   %ep1 = phi i32 [ 1, %entry ]
+    //   %u = add i32 %ep0, %ep0
+    EXPECT_EQ(getNumPHIs(BB), 2);
+    Instruction &AddI = *BB.getFirstNonPHIIt();
+    EXPECT_EQ(AddI.getOperand(0), ExistingPhi0);
+    EXPECT_EQ(AddI.getOperand(1), ExistingPhi0);
+    (void)ExistingPhi1; // Avoid "unused" warning.
+  });
+}
+
+// The two new PHIs are identical to each other but differ from the existing
+// ones. Expects one new PHI to be removed and the add to use %np0 for both
+// operands.
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_OrderNew) {
+  RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ]
+ %np1 = phi i32 [ 1, %entry ]
+ %ep0 = phi i32 [ 2, %entry ]
+ %ep1 = phi i32 [ 2, %entry ]
+ %u = add i32 %np0, %np1
+ ret void
+ }
+ )", [](BasicBlock &BB, auto *Eliminate) {
+    AssertingVH<PHINode> NewPhi0 = getPhi(BB, 0);
+    AssertingVH<PHINode> ExistingPhi0 = getPhi(BB, 2);
+    AssertingVH<PHINode> ExistingPhi1 = getPhi(BB, 3);
+    EXPECT_TRUE(Eliminate(&BB, getPhiIt(BB, 2)));
+    // Block contents expected after the call:
+    //   %np0 = phi i32 [ 1, %entry ]
+    //   %ep0 = phi i32 [ 2, %entry ]
+    //   %ep1 = phi i32 [ 2, %entry ]
+    //   %u = add i32 %np0, %np0
+    EXPECT_EQ(getNumPHIs(BB), 3);
+    Instruction &AddI = *BB.getFirstNonPHIIt();
+    EXPECT_EQ(AddI.getOperand(0), NewPhi0);
+    EXPECT_EQ(AddI.getOperand(1), NewPhi0);
+    (void)ExistingPhi0;
+    (void)ExistingPhi1; // Avoid "unused" warning.
+  });
+}
+
+// The new PHIs take the existing PHIs as their back-edge incoming values.
+// Expects only the two existing PHIs to remain and the add to use %ep0 and
+// %ep1.
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_NewRefExisting) {
+  RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+ %np1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+ %ep0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+ %ep1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+ %u = add i32 %np0, %np1
+ br label %testbb
+ }
+ )", [](BasicBlock &BB, auto *Eliminate) {
+    AssertingVH<PHINode> ExistingPhi0 = getPhi(BB, 2);
+    AssertingVH<PHINode> ExistingPhi1 = getPhi(BB, 3);
+    EXPECT_TRUE(Eliminate(&BB, getPhiIt(BB, 2)));
+    // Block contents expected after the call:
+    //   %ep0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+    //   %ep1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+    //   %u = add i32 %ep0, %ep1
+    EXPECT_EQ(getNumPHIs(BB), 2);
+    Instruction &AddI = *BB.getFirstNonPHIIt();
+    EXPECT_EQ(AddI.getOperand(0), ExistingPhi0);
+    EXPECT_EQ(AddI.getOperand(1), ExistingPhi1);
+  });
+}
+
+// The existing PHIs take the new PHIs as their back-edge incoming values.
+// Expects only the two existing PHIs to remain, now referring to themselves,
+// and the add to use %ep0 and %ep1.
+TEST(SSAUpdaterBulk, EliminateNewDuplicatePHINodes_ExistingRefNew) {
+  RunEliminateNewDuplicatePHINode(R"(
+ define void @main() {
+ entry:
+ br label %testbb
+ testbb:
+ %np0 = phi i32 [ 1, %entry ], [ %np0, %testbb ]
+ %np1 = phi i32 [ 1, %entry ], [ %np1, %testbb ]
+ %ep0 = phi i32 [ 1, %entry ], [ %np0, %testbb ]
+ %ep1 = phi i32 [ 1, %entry ], [ %np1, %testbb ]
+ %u = add i32 %np0, %np1
+ br label %testbb
+ }
+ )", [](BasicBlock &BB, auto *Eliminate) {
+    AssertingVH<PHINode> ExistingPhi0 = getPhi(BB, 2);
+    AssertingVH<PHINode> ExistingPhi1 = getPhi(BB, 3);
+    EXPECT_TRUE(Eliminate(&BB, getPhiIt(BB, 2)));
+    // Block contents expected after the call:
+    //   %ep0 = phi i32 [ 1, %entry ], [ %ep0, %testbb ]
+    //   %ep1 = phi i32 [ 1, %entry ], [ %ep1, %testbb ]
+    //   %u = add i32 %ep0, %ep1
+    EXPECT_EQ(getNumPHIs(BB), 2);
+    Instruction &AddI = *BB.getFirstNonPHIIt();
+    EXPECT_EQ(AddI.getOperand(0), ExistingPhi0);
+    EXPECT_EQ(AddI.getOperand(1), ExistingPhi1);
+  });
+}
diff --git a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
index 55b68f5..2a0f500 100644
--- a/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPDomTreeTest.cpp
@@ -45,8 +45,7 @@ TEST_F(VPDominatorTreeTest, DominanceNoRegionsTest) {
VPBlockUtils::connectBlocks(R1, Plan.getScalarHeader());
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
EXPECT_TRUE(VPDT.dominates(VPBB1, VPBB4));
EXPECT_FALSE(VPDT.dominates(VPBB4, VPBB1));
@@ -118,8 +117,7 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
VPBlockUtils::connectBlocks(R1, R2);
VPBlockUtils::connectBlocks(R2, Plan.getScalarHeader());
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
checkDomChildren(VPDT, R1, {R1BB1});
checkDomChildren(VPDT, R1BB1, {R1BB2, R1BB4, R1BB3});
@@ -197,8 +195,7 @@ TEST_F(VPDominatorTreeTest, DominanceRegionsTest) {
VPBlockUtils::connectBlocks(R1, VPBB2);
VPBlockUtils::connectBlocks(VPBB2, Plan.getScalarHeader());
- VPDominatorTree VPDT;
- VPDT.recalculate(Plan);
+ VPDominatorTree VPDT(Plan);
checkDomChildren(VPDT, VPBB1, {R1});
checkDomChildren(VPDT, R1, {R1BB1});
diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
index 5f9eb9a..fe212d1 100644
--- a/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Analysis/BUILD.gn
@@ -5,6 +5,7 @@ static_library("Analysis") {
"//clang/include/clang/AST:StmtDataCollectors",
"//clang/lib/AST",
"//clang/lib/ASTMatchers",
+ "//clang/lib/Analysis/LifetimeSafety",
"//clang/lib/Basic",
"//clang/lib/Lex",
"//llvm/lib/Support",
@@ -27,8 +28,6 @@ static_library("Analysis") {
"FixitUtil.cpp",
"IntervalPartition.cpp",
"IssueHash.cpp",
- "LifetimeAnnotations.cpp",
- "LifetimeSafety.cpp",
"LiveVariables.cpp",
"MacroExpansionContext.cpp",
"ObjCNoReturn.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn
new file mode 100644
index 0000000..7f962c4
--- /dev/null
+++ b/llvm/utils/gn/secondary/clang/lib/Analysis/LifetimeSafety/BUILD.gn
@@ -0,0 +1,20 @@
+# Static library built from the sources in clang/lib/Analysis/LifetimeSafety;
+# the produced library is named clangAnalysisLifetimeSafety.
+static_library("LifetimeSafety") {
+ output_name = "clangAnalysisLifetimeSafety"
+ configs += [ "//llvm/utils/gn/build:clang_code" ]
+ deps = [
+ "//clang/lib/AST",
+ "//clang/lib/Basic",
+ "//llvm/lib/Support",
+ ]
+ sources = [
+ "Checker.cpp",
+ "Facts.cpp",
+ "FactsGenerator.cpp",
+ "LifetimeAnnotations.cpp",
+ "LifetimeSafety.cpp",
+ "LiveOrigins.cpp",
+ "LoanPropagation.cpp",
+ "Loans.cpp",
+ "Origins.cpp",
+ ]
+}
diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
index 1afd342..c9f3a074 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
@@ -31,6 +31,7 @@ unittest("ClangAnalysisFlowSensitiveTests") {
"LoggerTest.cpp",
"MapLatticeTest.cpp",
"MatchSwitchTest.cpp",
+ "MockHeaders.cpp",
"MultiVarConstantPropagationTest.cpp",
"RecordOpsTest.cpp",
"SignAnalysisTest.cpp",
diff --git a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
index 0b9282e..d5a25f9 100644
--- a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
@@ -33,6 +33,7 @@ unittest("StaticAnalysisTests") {
"StoreTest.cpp",
"SymbolReaperTest.cpp",
"TestReturnValueUnderConstruction.cpp",
+ "UnsignedStatDemo.cpp",
"Z3CrosscheckOracleTest.cpp",
]
}
diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
index a25f058..4553968 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn
@@ -48,6 +48,7 @@ unittest("SupportTests") {
"FSUniqueIDTest.cpp",
"FileCollectorTest.cpp",
"FileOutputBufferTest.cpp",
+ "Format.cpp",
"FormatVariadicTest.cpp",
"GenericDomTreeTest.cpp",
"GlobPatternTest.cpp",
diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt
index bdcb8a3..343c2bb71 100644
--- a/llvm/utils/profcheck-xfail.txt
+++ b/llvm/utils/profcheck-xfail.txt
@@ -1129,6 +1129,7 @@ Transforms/LowerIFunc/ifunc-alias.ll
Transforms/LowerIFunc/ifunc-nonsense-resolvers.ll
Transforms/LowerIFunc/ifunc-program-addrspace.ll
Transforms/LowerIFunc/lower-ifunc.ll
+Transforms/LowerMatrixIntrinsics/data-layout-multiply-fused.ll
Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
Transforms/LowerMatrixIntrinsics/multiply-fused.ll
Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll
@@ -1311,82 +1312,6 @@ Transforms/SimpleLoopUnswitch/pr60736.ll
Transforms/SimpleLoopUnswitch/trivial-unswitch-freeze-individual-conditions.ll
Transforms/SimpleLoopUnswitch/trivial-unswitch.ll
Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll
-Transforms/SLPVectorizer/AArch64/gather-root.ll
-Transforms/SLPVectorizer/AArch64/horizontal.ll
-Transforms/SLPVectorizer/AArch64/loadi8.ll
-Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
-Transforms/SLPVectorizer/AArch64/uselistorder.ll
-Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll
-Transforms/SLPVectorizer/AArch64/vectorizable-selects-min-max.ll
-Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
-Transforms/SLPVectorizer/AMDGPU/horizontal-store.ll
-Transforms/SLPVectorizer/bool-logical-op-reduction-with-poison.ll
-Transforms/SLPVectorizer/call-arg-reduced-by-minbitwidth.ll
-Transforms/SLPVectorizer/const-bool-logical-or-reduction.ll
-Transforms/SLPVectorizer/extracts-with-undefs.ll
-Transforms/SLPVectorizer/freeze-signedness-missed.ll
-Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll
-Transforms/SLPVectorizer/gather_extract_from_vectorbuild.ll
-Transforms/SLPVectorizer/insert-element-build-vector-const.ll
-Transforms/SLPVectorizer/insert-element-build-vector-inseltpoison.ll
-Transforms/SLPVectorizer/insert-element-build-vector.ll
-Transforms/SLPVectorizer/logical-ops-poisonous-repeated.ll
-Transforms/SLPVectorizer/minbitwidth-node-with-multi-users.ll
-Transforms/SLPVectorizer/minbitwidth-user-not-min.ll
-Transforms/SLPVectorizer/partial-register-extract.ll
-Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll
-Transforms/SLPVectorizer/reorder-node.ll
-Transforms/SLPVectorizer/reused-buildvector-matching-vectorized-node.ll
-Transforms/SLPVectorizer/revec.ll
-Transforms/SLPVectorizer/RISCV/remarks_cmp_sel_min_max.ll
-Transforms/SLPVectorizer/RISCV/remarks-insert-into-small-vector.ll
-Transforms/SLPVectorizer/RISCV/reordered-interleaved-loads.ll
-Transforms/SLPVectorizer/RISCV/revec.ll
-Transforms/SLPVectorizer/RISCV/select-profitability.ll
-Transforms/SLPVectorizer/RISCV/shuffled-gather-casted.ll
-Transforms/SLPVectorizer/RISCV/unsigned-node-trunc-with-signed-users.ll
-Transforms/SLPVectorizer/slp-deleted-inst.ll
-Transforms/SLPVectorizer/SystemZ/cmp-ptr-minmax.ll
-Transforms/SLPVectorizer/SystemZ/ext-not-resized-op-resized.ll
-Transforms/SLPVectorizer/SystemZ/minbitwidth-trunc.ll
-Transforms/SLPVectorizer/X86/bool-mask.ll
-Transforms/SLPVectorizer/X86/bv-root-part-of-graph.ll
-Transforms/SLPVectorizer/X86/cmp-after-intrinsic-call-minbitwidth.ll
-Transforms/SLPVectorizer/X86/cmp-as-alternate-ops.ll
-Transforms/SLPVectorizer/X86/cmp_sel.ll
-Transforms/SLPVectorizer/X86/crash_7zip.ll
-Transforms/SLPVectorizer/X86/crash_clear_undefs.ll
-Transforms/SLPVectorizer/X86/crash_cmpop.ll
-Transforms/SLPVectorizer/X86/debug-counter.ll
-Transforms/SLPVectorizer/X86/debug-info-salvage.ll
-Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll
-Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
-Transforms/SLPVectorizer/X86/ext-used-scalar-different-bitwidth.ll
-Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll
-Transforms/SLPVectorizer/X86/horizontal-minmax.ll
-Transforms/SLPVectorizer/X86/insert-after-bundle.ll
-Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll
-Transforms/SLPVectorizer/X86/minbitwidth-icmp-to-trunc.ll
-Transforms/SLPVectorizer/X86/minbw-user-non-sizable.ll
-Transforms/SLPVectorizer/X86/non-load-reduced-as-part-of-bv.ll
-Transforms/SLPVectorizer/X86/ordering-bug.ll
-Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
-Transforms/SLPVectorizer/X86/phi-node-reshuffled-part.ll
-Transforms/SLPVectorizer/X86/pr46983.ll
-Transforms/SLPVectorizer/X86/pr49933.ll
-Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
-Transforms/SLPVectorizer/X86/reduction-bool-logic-op-inside.ll
-Transforms/SLPVectorizer/X86/reduction-logical.ll
-Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll
-Transforms/SLPVectorizer/X86/reused-reductions-with-minbitwidth.ll
-Transforms/SLPVectorizer/X86/select-reduction-op.ll
-Transforms/SLPVectorizer/X86/shrink_after_reorder.ll
-Transforms/SLPVectorizer/X86/subvector-minbitwidth-unsigned-value.ll
-Transforms/SLPVectorizer/X86/undef_vect.ll
-Transforms/SLPVectorizer/X86/used-reduced-op.ll
-Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll
-Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll
-Transforms/SLPVectorizer/X86/whole-registers-compare.ll
Transforms/SROA/addrspacecast.ll
Transforms/SROA/phi-and-select.ll
Transforms/SROA/phi-gep.ll